diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 419f90dab..203320826 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -219,6 +219,7 @@ In chronological order:
 * Mark Seminatore
   * [2023-11-09] Improve Windows threading performance scaling
+  * [2024-02-09] Introduce MT_TRACE facility and improve code consistency
 
 * Dirreke
   * [2024-01-16] Add basic support for the CSKY architecture
diff --git a/Makefile.prebuild b/Makefile.prebuild
index 83da8e2ce..b7d695a75 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -59,6 +59,22 @@ ifeq ($(TARGET), CK860FV)
 TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
 endif
 
+ifeq ($(TARGET), x280)
+TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
+endif
+
+ifeq ($(TARGET), RISCV64_ZVL256B)
+TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
+endif
+
+ifeq ($(TARGET), RISCV64_ZVL128B)
+TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
+endif
+
+ifeq ($(TARGET), RISCV64_GENERIC)
+TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
+endif
+
 all: getarch_2nd
 	./getarch_2nd  0 >> $(TARGET_MAKE)
 	./getarch_2nd  1 >> $(TARGET_CONF)
diff --git a/Makefile.riscv64 b/Makefile.riscv64
index ce91e03ec..113cc57c5 100644
--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@@ -2,3 +2,19 @@ ifeq ($(CORE), C910V)
 CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
 endif
+ifeq ($(CORE), x280)
+CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
+FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
+endif
+ifeq ($(CORE), RISCV64_ZVL256B)
+CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
+endif
+ifeq ($(CORE), RISCV64_ZVL128B)
+CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
+endif
+ifeq ($(CORE), RISCV64_GENERIC)
+CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
+endif
diff --git a/README.md b/README.md
index b8d66ed42..2f0a0da4c 100644
--- a/README.md
+++ b/README.md
@@ -198,6 +198,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
   ```
   (also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)
 
+- **x280**: Level-1, Level-2 and Level-3 BLAS are optimized using the RISC-V Vector extension 1.0.
+  ```sh
+  make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
+  ```
+
 ### Support for multiple targets in a single library
 
 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
diff --git a/TargetList.txt b/TargetList.txt
index c11b94fa5..115030c1b 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -118,8 +118,11 @@ Z13
 Z14
 
 10.RISC-V 64:
-RISCV64_GENERIC
+RISCV64_GENERIC (e.g. 
PolarFire Soc/SiFive U54) +RISCV64_ZVL128B C910V +x280 +RISCV64_ZVL256B 11.LOONGARCH64: LOONGSONGENERIC diff --git a/benchmark/Makefile b/benchmark/Makefile index 6a7c54636..b7493950a 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a +# x280 temporary workaround for gfortran +ifeq ($(TARGET), x280) +CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT)) +endif + + ifneq ($(NO_LAPACK), 1) GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ @@ -265,9 +271,9 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ ismax.goto idmax.goto \ isamin.goto idamin.goto icamin.goto izamin.goto \ ismin.goto idmin.goto \ - samax.goto damax.goto scamax.goto dzamax.goto \ + samax.goto damax.goto camax.goto zamax.goto \ smax.goto dmax.goto \ - samin.goto damin.goto scamin.goto dzamin.goto \ + samin.goto damin.goto camin.goto zamin.goto \ smin.goto dmin.goto \ saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) @@ -2832,12 +2838,12 @@ samax.goto : samax.$(SUFFIX) ../$(LIBNAME) damax.goto : damax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## SCAMAX ############################################## -scamax.goto : scamax.$(SUFFIX) ../$(LIBNAME) +############################################## CAMAX ############################################## +camax.goto : camax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## DZAMAX ############################################## -dzamax.goto : dzamax.$(SUFFIX) ../$(LIBNAME) +############################################## ZAMAX ############################################## +zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ############################################## SMAX ############################################## @@ -2856,12 +2862,12 @@ samin.goto : samin.$(SUFFIX) ../$(LIBNAME) damin.goto : damin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## SCAMIN ############################################## -scamin.goto : scamin.$(SUFFIX) ../$(LIBNAME) +############################################## CAMIN ############################################## +camin.goto : camin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -############################################## DZAMIN ############################################## -dzamin.goto : dzamin.$(SUFFIX) ../$(LIBNAME) +############################################## ZAMIN ############################################## +zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ############################################## SMIN ############################################## @@ -3383,10 +3389,10 @@ samax.$(SUFFIX) : amax.c damax.$(SUFFIX) : amax.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) 
$^ -scamax.$(SUFFIX) : amax.c +camax.$(SUFFIX) : amax.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ -dzamax.$(SUFFIX) : amax.c +zamax.$(SUFFIX) : amax.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ @@ -3403,10 +3409,10 @@ samin.$(SUFFIX) : amin.c damin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -scamin.$(SUFFIX) : amin.c +camin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ -dzamin.$(SUFFIX) : amin.c +zamin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ @@ -3436,4 +3442,4 @@ smallscaling: smallscaling.c ../$(LIBNAME) clean :: @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling -include $(TOPDIR)/Makefile.tail +include $(TOPDIR)/Makefile.tail \ No newline at end of file diff --git a/cblas.h b/cblas.h index ade2fca3a..beaa32cc2 100644 --- a/cblas.h +++ b/cblas.h @@ -101,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + +float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); +double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); +float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); +double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); + CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); @@ -116,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); +void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); +void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); + void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); @@ -290,6 +303,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, 
OPENBLAS_CONST enum CBLA void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); +void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); +void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc); +void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); +void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K, + OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); diff --git a/common_interface.h b/common_interface.h index 318827920..5a2e1654c 100644 --- a/common_interface.h +++ b/common_interface.h @@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double * void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, 
double *, double *, blasint *);
+
 int    BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *,
			 float *, float *, blasint *, float *, blasint *,
			 float *, float *, blasint *);
@@ -764,8 +773,8 @@ xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *);
 
 void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
 void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
-void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
-void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
+void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *);
+void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *);
 
 void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *);
 void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *);
diff --git a/common_riscv64.h b/common_riscv64.h
index 7ddbe80a4..ab3bfa25a 100644
--- a/common_riscv64.h
+++ b/common_riscv64.h
@@ -91,8 +91,26 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define BUFFER_SIZE     ( 32 << 20)
 #define SEEK_ADDRESS
 
-#if defined(C910V)
-#include <riscv_vector.h>
+#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280)
+# include <riscv_vector.h>
+#endif
+
+#if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 )
+// t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this
+#define RISCV_0p10_INTRINSICS
+#define RISCV_RVV(x) x
+#else
+#define RISCV_RVV(x) __riscv_ ## x
+#endif
+
+#if defined(C910V) || defined(RISCV64_ZVL256B)
+# if !defined(DOUBLE)
+#  define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v)
+# else
+#  define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v)
+# endif
+#else
+# define EXTRACT_FLOAT(v) (v[0])
 #endif
 
 #endif
diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c
index 894d2b873..ff7ba2aad 100644
--- a/cpuid_riscv64.c
+++ b/cpuid_riscv64.c
@@ -70,12 +70,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /*  or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#define CPU_GENERIC 0 -#define CPU_C910V 1 +#define CPU_GENERIC 0 +#define CPU_C910V 1 +#define CPU_x280 2 +#define CPU_RISCV64_ZVL256B 3 +#define CPU_RISCV64_ZVL128B 4 static char *cpuname[] = { "RISCV64_GENERIC", - "C910V" + "C910V", + "x280", + "CPU_RISCV64_ZVL256B", + "CPU_RISCV64_ZVL128B" +}; + +static char *cpuname_lower[] = { + "riscv64_generic", + "c910v", + "x280", + "riscv64_zvl256b", + "riscv64_zvl128b" }; int detect(void){ @@ -86,23 +100,29 @@ int detect(void){ char *pmodel = NULL, *pisa = NULL; infile = fopen("/proc/cpuinfo", "r"); + if (!infile) + return CPU_GENERIC; while (fgets(buffer, sizeof(buffer), infile)){ if(!strncmp(buffer, "model name", 10)){ strcpy(model_buffer, buffer); - pmodel = strchr(isa_buffer, ':') + 1; + pmodel = strchr(model_buffer, ':'); + if (pmodel) + pmodel++; } if(!strncmp(buffer, "isa", 3)){ strcpy(isa_buffer, buffer); - pisa = strchr(isa_buffer, '4') + 1; + pisa = strchr(isa_buffer, '4'); + if (pisa) + pisa++; } } fclose(infile); - if (!pmodel) + if (!pmodel || !pisa) return(CPU_GENERIC); - + if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v')) return CPU_C910V; @@ -140,5 +160,5 @@ void get_cpuconfig(void){ } void get_libname(void){ - printf("riscv64\n"); + printf("%s", cpuname_lower[detect()]); } diff --git a/ctest/Makefile b/ctest/Makefile index af5b34a36..ad960b35a 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -218,6 +218,9 @@ ifeq ($(F_COMPILER), IBM) ifeq ($(C_COMPILER), GCC) CEXTRALIB += -lgomp endif +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB += -lomp +endif endif endif diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index 1a123d74d..cad7c7fa7 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -96,7 +96,7 @@ INTEGER ICAMAXTEST EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST * .. External Subroutines .. - EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1 + EXTERNAL CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1 * .. Intrinsic Functions .. INTRINSIC MAX * .. Common blocks .. @@ -214,8 +214,8 @@ CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1), + STRUE4(NP1),SFAC) ELSE IF (ICASE.EQ.8) THEN -* .. CSCAL .. - CALL CSCAL(N,CA,CX,INCX) +* .. CSCALTEST .. + CALL CSCALTEST(N,CA,CX,INCX) CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + SFAC) ELSE IF (ICASE.EQ.9) THEN @@ -236,14 +236,14 @@ * INCX = 1 IF (ICASE.EQ.8) THEN -* CSCAL +* CSCALTEST * Add a test for alpha equal to zero. CA = (0.0E0,0.0E0) DO 80 I = 1, 5 MWPCT(I) = (0.0E0,0.0E0) MWPCS(I) = (1.0E0,1.0E0) 80 CONTINUE - CALL CSCAL(5,CA,CX,INCX) + CALL CSCALTEST(5,CA,CX,INCX) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) ELSE IF (ICASE.EQ.9) THEN * CSSCALTEST diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c index b4c512436..d9a539097 100644 --- a/ctest/c_cblat1c.c +++ b/ctest/c_cblat1c.c @@ -440,6 +440,7 @@ static real c_b43 = (float)1.; extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*); static complex mwpcs[5], mwpct[5]; extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*); + extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_(); static complex cx[8]; extern real scnrm2test_(integer*, complex*, integer*); static integer np1; @@ -481,7 +482,7 @@ static real c_b43 = (float)1.; stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac); } else if (combla_1.icase == 8) { /* .. CSCAL .. 
*/ - cscal_(&combla_1.n, &ca, cx, &combla_1.incx); + cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx); ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac); } else if (combla_1.icase == 9) { @@ -515,7 +516,7 @@ static real c_b43 = (float)1.; mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.; /* L80: */ } - cscal_(&c__5, &ca, cx, &combla_1.incx); + cscaltest_(&c__5, &ca, cx, &combla_1.incx); ctest_(&c__5, cx, mwpct, mwpcs, sfac); } else if (combla_1.icase == 9) { /* CSSCALTEST */ diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 2d41af228..2ad8b8c5f 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -48,6 +48,12 @@ #endif #endif +#ifdef SMP_DEBUG +# define MT_TRACE(...) fprintf(stderr, __VA_ARGS__) +#else +# define MT_TRACE(...) +#endif + /* This is a thread implementation for Win32 lazy implementation */ /* Thread server common information */ @@ -68,19 +74,12 @@ static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; static volatile int thread_target; // target num of live threads, volatile for cross-thread reads -#if defined (__GNUC__) && (__GNUC__ < 6) - #define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch) -#else - #if defined(_WIN64) - #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp) - #else - #define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp) - #endif -#endif +// +// Legacy code path +// +static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) { -static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ - - if (!(mode & BLAS_COMPLEX)){ + if (!(mode & BLAS_COMPLEX)) { #ifdef EXPRECISION if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* REAL / Extended Double */ @@ -95,7 +94,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if ((mode & BLAS_PREC) == BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE) { /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, @@ -106,7 +105,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ + } else if ((mode & BLAS_PREC) == BLAS_SINGLE) { /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, @@ -118,7 +117,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> b, args -> ldb, args -> c, args -> ldc, sb); #ifdef BUILD_BFLOAT16 - } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ + } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) { /* REAL / BFLOAT16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, @@ -129,7 +128,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else if ((mode & BLAS_PREC) == BLAS_STOBF16){ + } else if ((mode & BLAS_PREC) == BLAS_STOBF16) { /* REAL / BLAS_STOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, bfloat16 *, BLASLONG, @@ -140,7 +139,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); 
- } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){ + } else if ((mode & BLAS_PREC) == BLAS_DTOBF16) { /* REAL / BLAS_DTOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, bfloat16 *, BLASLONG, @@ -157,7 +156,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } } else { #ifdef EXPRECISION - if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE) { /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -171,7 +170,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if ((mode & BLAS_PREC) == BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE) { /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, @@ -201,10 +200,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ } } -/* This is a main routine of threads. Each thread waits until job is */ -/* queued. */ - -static DWORD WINAPI blas_thread_server(void *arg){ +// +// This is a main routine of threads. Each thread waits until job is queued. +// +static DWORD WINAPI blas_thread_server(void *arg) { /* Thread identifier */ BLASLONG cpu = (BLASLONG)arg; @@ -215,31 +214,24 @@ static DWORD WINAPI blas_thread_server(void *arg){ /* Each server needs each buffer */ buffer = blas_memory_alloc(2); -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu); -#endif + MT_TRACE("Server[%2ld] Thread is started!\n", cpu); - while (1){ + while (1) { /* Waiting for Queue */ -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu); -#endif - // event raised when work is added to the queue - WaitForSingleObject(kickoff_event, INFINITE); + MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu); - if (cpu > thread_target - 2) - { - //printf("thread [%d] exiting.\n", cpu); - break; // excess thread, so worker thread exits - } + // event raised when work is added to the queue + WaitForSingleObject(kickoff_event, INFINITE); -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); -#endif + if (cpu > thread_target - 2) { + //MT_TRACE("thread [%d] exiting.\n", cpu); + break; // excess thread, so worker thread exits + } + + MT_TRACE("Server[%2ld] Got it.\n", cpu); -#if 1 EnterCriticalSection(&queue_lock); queue = work_queue; @@ -247,53 +239,39 @@ static DWORD WINAPI blas_thread_server(void *arg){ work_queue = work_queue->next; LeaveCriticalSection(&queue_lock); -#else - volatile blas_queue_t* queue_next; - - INT_PTR prev_value; - do { - queue = (volatile blas_queue_t*)work_queue; - if (!queue) - break; - - queue_next = (volatile blas_queue_t*)queue->next; - prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue); - } while (prev_value != queue); -#endif - if (queue) { + if (queue) { int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; sa = queue -> sa; sb = queue -> sb; -#ifdef CONSISTENT_FPCSR - __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); - __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); -#endif + #ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); + __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); + #endif -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", + MT_TRACE("Server[%2ld] Started. 
Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); -#endif // fprintf(stderr, "queue start[%ld]!!!\n", cpu); -#ifdef MONITOR - main_status[cpu] = MAIN_RUNNING1; -#endif + #ifdef MONITOR + main_status[cpu] = MAIN_RUNNING1; + #endif - if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); + if (sa == NULL) + sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); if (sb == NULL) { - if (!(queue -> mode & BLAS_COMPLEX)){ + if (!(queue -> mode & BLAS_COMPLEX)) { #ifdef EXPRECISION - if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) { sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { #ifdef BUILD_DOUBLE sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); @@ -327,65 +305,58 @@ static DWORD WINAPI blas_thread_server(void *arg){ /* Other types in future */ } } - queue->sb=sb; + queue->sb=sb; } -#ifdef MONITOR - main_status[cpu] = MAIN_RUNNING2; -#endif + #ifdef MONITOR + main_status[cpu] = MAIN_RUNNING2; + #endif if (!(queue -> mode & BLAS_LEGACY)) { - - (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); + (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); } else { - legacy_exec(routine, queue -> mode, queue -> args, sb); + legacy_exec(routine, queue -> mode, queue -> args, sb); } - }else{ - continue; //if queue == NULL - } + } else { + continue; //if queue == NULL + } -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); -#endif + MT_TRACE("Server[%2ld] Finished!\n", cpu); - queue->finished = 1; - + queue->finished = 1; } /* Shutdown procedure */ -#ifdef SMP_DEBUG - fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); -#endif + MT_TRACE("Server[%2ld] Shutdown!\n", cpu); blas_memory_free(buffer); return 0; - } +} -/* Initializing routine */ -int blas_thread_init(void){ +// +// Initializing routine +// +int blas_thread_init(void) { BLASLONG i; if (blas_server_avail || (blas_cpu_number <= 1)) return 0; LOCK_COMMAND(&server_lock); -#ifdef SMP_DEBUG - fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n", - blas_cpu_number); -#endif + MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number); - if (!blas_server_avail){ - // create the kickoff Event - kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); + if (!blas_server_avail) { + // create the kickoff Event + kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); - thread_target = blas_cpu_number; + thread_target = blas_cpu_number; InitializeCriticalSection(&queue_lock); - for(i = 0; i < blas_cpu_number - 1; i++){ - //printf("thread_init: creating thread [%d]\n", i); + for(i = 0; i < blas_cpu_number - 1; i++) { + //MT_TRACE("thread_init: creating thread [%d]\n", i); blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, @@ -400,15 +371,12 @@ int blas_thread_init(void){ return 0; } -/* - User can call one of two routines. - - exec_blas_async ... immediately returns after jobs are queued. - - exec_blas ... returns after jobs are finished. -*/ - -int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ +// +// User can call one of two routines. +// exec_blas_async ... immediately returns after jobs are queued. +// exec_blas ... returns after jobs are finished. 
+// +int exec_blas_async(BLASLONG pos, blas_queue_t *queue) { #if defined(SMP_SERVER) // Handle lazy re-init of the thread-pool after a POSIX fork @@ -428,7 +396,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode)); #endif - current->finished = 0; + current->finished = 0; current = current -> next; pos ++; } @@ -437,18 +405,18 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ if (!work_queue) { - work_queue = queue; + work_queue = queue; } else { blas_queue_t *next_item = work_queue; - // find the end of the work queue - while (next_item) - next_item = next_item->next; + // find the end of the work queue + while (next_item) + next_item = next_item->next; - // add new work to the end - next_item = queue; + // add new work to the end + next_item = queue; } LeaveCriticalSection(&queue_lock); @@ -458,26 +426,25 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ return 0; } -int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ +// +// Join. Wait for all queued tasks to complete +// +int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) { -#ifdef SMP_DEBUG - fprintf(STDERR, "Synchronization Waiting.\n"); -#endif + MT_TRACE("Synchronization Waiting.\n"); - while (num){ -#ifdef SMP_DEBUG - fprintf(STDERR, "Waiting Queue ..\n"); -#endif - while (!queue->finished) - YIELDING; + while (num) { + MT_TRACE("Waiting Queue ..\n"); - queue = queue->next; - num--; - } + while (!queue->finished) + YIELDING; + + queue = queue->next; + num--; + } + + MT_TRACE("Completely Done.\n\n"); -#ifdef SMP_DEBUG - fprintf(STDERR, "Completely Done.\n\n"); -#endif // if work was added to the queue after this batch we can't sleep the worker threads // by resetting the event EnterCriticalSection(&queue_lock); @@ -490,8 +457,10 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ return 0; } -/* Execute Threads */ -int exec_blas(BLASLONG num, blas_queue_t *queue){ +// +// Execute Threads +// +int exec_blas(BLASLONG num, blas_queue_t *queue) { #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) // Handle lazy re-init of the thread-pool after a POSIX fork @@ -504,29 +473,33 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ if ((num <= 0) || (queue == NULL)) return 0; - if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); + if ((num > 1) && queue -> next) + exec_blas_async(1, queue -> next); routine = queue -> routine; if (queue -> mode & BLAS_LEGACY) { legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); - } else + } else { if (queue -> mode & BLAS_PTHREAD) { void (*pthreadcompat)(void *) = queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, - queue -> sa, queue -> sb, 0); + queue -> sa, queue -> sb, 0); + } - if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); + if ((num > 1) && queue -> next) + exec_blas_async_wait(num - 1, queue -> next); return 0; } -/* Shutdown procedure, but user don't have to call this routine. The */ -/* kernel automatically kill threads. */ - -int BLASFUNC(blas_thread_shutdown)(void){ +// +// Shutdown procedure, but user don't have to call this routine. The +// kernel automatically kill threads. 
+//
+int BLASFUNC(blas_thread_shutdown)(void) {
 
   int i;
 
@@ -534,9 +507,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
 
   LOCK_COMMAND(&server_lock);
 
-  if (blas_server_avail){
+  if (blas_server_avail) {
 
-    for(i = 0; i < blas_num_threads - 1; i++){
+    for (i = 0; i < blas_num_threads - 1; i++) {
       // Could also just use WaitForMultipleObjects
       DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
 
@@ -558,6 +531,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
   return 0;
 }
 
+//
+// Legacy function to set the number of threads
+//
 void goto_set_num_threads(int num_threads)
 {
   long i;
@@ -571,7 +547,7 @@ void goto_set_num_threads(int num_threads)
 
   if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
 
-  if (blas_server_avail && num_threads < blas_num_threads) {
+  if (blas_server_avail && num_threads < blas_num_threads) {
     LOCK_COMMAND(&server_lock);
 
     thread_target = num_threads;
@@ -579,11 +555,11 @@ void goto_set_num_threads(int num_threads)
     SetEvent(kickoff_event);
 
     for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
-      //printf("set_num_threads: waiting on thread [%d] to quit.\n", i);
+      //MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i);
 
       WaitForSingleObject(blas_threads[i], INFINITE);
 
-      //printf("set_num_threads: thread [%d] has quit.\n", i);
+      //MT_TRACE("set_num_threads: thread [%d] has quit.\n", i);
 
       CloseHandle(blas_threads[i]);
     }
 
@@ -601,8 +577,8 @@ void goto_set_num_threads(int num_threads)
 
     thread_target = num_threads;
 
-    //increased_threads = 1;
-    if (!blas_server_avail){
+    //increased_threads = 1;
+    if (!blas_server_avail) {
       // create the kickoff Event
       kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
 
@@ -611,8 +587,8 @@ void goto_set_num_threads(int num_threads)
       blas_server_avail = 1;
     }
 
-    for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
+    for (i = (blas_num_threads > 0) ? 
blas_num_threads - 1 : 0; i < num_threads - 1; i++) { + //MT_TRACE("set_num_threads: creating thread [%d]\n", i); blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, @@ -627,6 +603,9 @@ void goto_set_num_threads(int num_threads) blas_cpu_number = num_threads; } +// +// Openblas function to set thread count +// void openblas_set_num_threads(int num) { goto_set_num_threads(num); diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 69a473060..e3f905265 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -275,6 +275,7 @@ extern gotoblas_t gotoblas_EXCAVATOR; #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE #define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE +#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE #else extern gotoblas_t gotoblas_HASWELL; extern gotoblas_t gotoblas_ZEN; diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 0454f186c..4c1f4a26e 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -43,6 +43,13 @@ char *gotoblas_corename(void) { #define CPU_POWER9 9 #define CPU_POWER10 10 +#ifndef POWER_9 +#define POWER_9 0x20000 /* 9 class CPU */ +#endif +#ifndef POWER_10 +#define POWER_10 0x40000 /* 10 class CPU */ +#endif + #ifdef _AIX #include @@ -62,7 +69,7 @@ static int cpuid(void) else if (arch == POWER_9) return CPU_POWER9; #endif #ifdef POWER_10 - else if (arch == POWER_10) return CPU_POWER10; + else if (arch >= POWER_10) return CPU_POWER10; #endif return CPU_UNKNOWN; } @@ -332,6 +339,9 @@ void gotoblas_dynamic_init(void) { if (gotoblas && gotoblas -> init) { strncpy(coren,gotoblas_corename(),20); sprintf(coremsg, "Core: %s\n",coren); + if (getenv("GET_OPENBLAS_CORETYPE")) { + fprintf(stderr, "%s", coremsg); + } openblas_warning(2, coremsg); gotoblas -> init(); } else { diff --git a/driver/others/memory.c b/driver/others/memory.c index caef3e2b7..4ee8f9a2e 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -3214,7 +3214,7 @@ void blas_shutdown(void){ #endif memory[pos].lock = 0; } - if (memory_overflowed) + if (memory_overflowed) { for (pos = 0; pos < NEW_BUFFERS; pos ++){ newmemory[pos].addr = (void *)0; newmemory[pos].used = 0; @@ -3222,6 +3222,10 @@ void blas_shutdown(void){ newmemory[pos].pos = -1; #endif newmemory[pos].lock = 0; + } + free(newmemory); + newmemory = NULL; + memory_overflowed = 0; } UNLOCK_COMMAND(&alloc_lock); diff --git a/exports/gensymbol b/exports/gensymbol index 704eab06f..226035842 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -60,6 +60,7 @@ cblasobjsc=" cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv cblas_scnrm2 cblas_scasum cblas_cgemmt cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy + cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin " cblasobjsd=" cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot @@ -69,6 +70,7 @@ cblasobjsd=" cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy + cblas_damax cblas_damin " cblasobjss=" @@ -80,6 +82,7 @@ cblasobjss=" cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm cblas_strsv cblas_sgeadd cblas_sgemmt cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy + cblas_samax 
cblas_samin " cblasobjsz=" @@ -91,6 +94,7 @@ cblasobjsz=" cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub cblas_zaxpby cblas_zgeadd cblas_zgemmt cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy + cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin " cblasobjs="cblas_xerbla" @@ -861,6 +865,53 @@ lapackobjs2z="$lapackobjs2z zgedmd zgedmdq " + +#functions added post 3.11 + +lapackobjs2c="$lapackobjs2c + claqp2rk + claqp3rk + ctrsyl3 + " +# claqz0 +# claqz1 +# claqz2 +# claqz3 +# clatrs3 + +lapackobjs2d="$lapackobjs2d + dgelqs + dgelst + dgeqp3rk + dgeqrs + dlaqp2rk + dlaqp3rk + dlarmm + dlatrs3 + dtrsyl3 + " +# dlaqz0 +# dlaqz1 +# dlaqz2 +# dlaqz3 +# dlaqz4 + +lapackobjs2z="$lapackobjs2z + zgelqs + zgelst + zgeqp3rk + zgeqrs + zlaqp2rk + zlaqp3rk + zlatrs3 + zrscl + ztrsyl3 + " +# zlaqz0 +# zlaqz1 +# zlaqz2 +# zlaqz3 + lapack_extendedprecision_objs=" zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx @@ -1622,6 +1673,14 @@ lapackeobjsc=" LAPACKE_cgetsqrhrt_work LAPACKE_cungtsqr_row LAPACKE_cungtsqr_row_work + LAPACKE_clangb + LAPACKE_clangb_work + LAPACKE_ctrsyl3 + LAPACKE_ctrsyl3_work + LAPACKE_ctz_nancheck + LAPACKE_ctz_trans + LAPACKE_cunhr_col + LAPACKE_cunhr_col_work " lapackeobjsd=" @@ -2239,6 +2298,14 @@ lapackeobjsd=" LAPACKE_dgetsqrhrt_work LAPACKE_dorgtsqr_row LAPACKE_dorgtsqr_row_work + LAPACKE_dlangb + LAPACKE_dlangb_work + LAPACKE_dorhr_col + LAPACKE_dorhr_col_work + LAPACKE_dtrsyl3 + LAPACKE_dtrsyl3_work + LAPACKE_dtz_nancheck + LAPACKE_dtz_trans " lapackeobjss=" @@ -2848,6 +2915,14 @@ lapackeobjss=" LAPACKE_sgetsqrhrt_work LAPACKE_sorgtsqr_row LAPACKE_sorgtsqr_row_work + LAPACKE_slangb + LAPACKE_slangb_work + LAPACKE_sorhr_col + LAPACKE_sorhr_col_work + LAPACKE_strsyl3 + LAPACKE_strsyl3_work + LAPACKE_stz_nancheck + LAPACKE_stz_trans " lapackeobjsz=" @@ -3515,6 +3590,14 @@ lapackeobjsz=" LAPACKE_zgetsqrhrt_work LAPACKE_zungtsqr_row LAPACKE_zungtsqr_row_work + LAPACKE_zlangb + LAPACKE_zlangb_work + LAPACKE_ztrsyl3 + LAPACKE_ztrsyl3_work + LAPACKE_ztz_nancheck + LAPACKE_ztz_trans + LAPACKE_zunhr_col + LAPACKE_zunhr_col_work " ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the @@ -3616,6 +3699,7 @@ lapack_embeded_underscore_objs_s=" ssysv_aa_2stage ssytrf_aa_2stage ssytrs_aa_2stage slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col + slarfb_gett " lapack_embeded_underscore_objs_c=" chetf2_rook chetrf_rook chetri_rook @@ -3641,6 +3725,7 @@ lapack_embeded_underscore_objs_c=" csysv_aa_2stage csytrf_aa_2stage csytrs_aa_2stage claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col + clarfb_gett " lapack_embeded_underscore_objs_d=" dlasyf_rook @@ -3658,6 +3743,7 @@ lapack_embeded_underscore_objs_d=" dsysv_aa_2stage dsytrf_aa_2stage dsytrs_aa_2stage dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col + dlarfb_gett " lapack_embeded_underscore_objs_z=" zhetf2_rook zhetrf_rook zhetri_rook @@ -3682,6 +3768,7 @@ lapack_embeded_underscore_objs_z=" zhetrs_aa_2stage zsysv_aa_2stage zsytrf_aa_2stage zsytrs_aa_2stage zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col + zlarfb_gett " dirname=`pwd -P`/../lapack-netlib diff --git a/getarch.c b/getarch.c index 2d26da079..f879e6bbb 100644 --- a/getarch.c +++ b/getarch.c @@ -1679,9 +1679,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define LIBNAME "c910v" #define CORENAME "C910V" #endif +#endif +#ifdef FORCE_x280 +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "x280" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-Dx280 " \ + "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "x280" +#define CORENAME "x280" #else #endif +#ifdef FORCE_RISCV64_ZVL256B +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_ZVL256B" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_ZVL256B " \ + "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_zvl256b" +#define CORENAME "RISCV64_ZVL256B" +#endif + +#ifdef FORCE_RISCV64_ZVL128B +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_ZVL128B" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_ZVL128B " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_zvl128b" +#define CORENAME "RISCV64_ZVL128B" +#endif #if defined(FORCE_E2K) || defined(__e2k__) #define FORCE diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 4e082928b..55374674a 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -119,6 +119,7 @@ endif () if (BUILD_BFLOAT16) GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") @@ -130,6 +131,8 @@ endif () foreach (float_type ${FLOAT_TYPES}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") + GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type}) + GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) diff --git a/interface/Makefile b/interface/Makefile index 78335357b..048d679d6 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -270,7 +270,8 @@ CSBLAS1OBJS = \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ - cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) + cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \ + cblas_samin.$(SUFFIX) CSBLAS2OBJS = \ cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ @@ -295,7 +296,8 @@ CDBLAS1OBJS = \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) 
\ - cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) + cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \ + cblas_damin.$(SUFFIX) CDBLAS2OBJS = \ cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ @@ -315,7 +317,7 @@ CCBLAS1OBJS = \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ - cblas_caxpby.$(SUFFIX) \ + cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \ cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) CCBLAS2OBJS = \ @@ -340,12 +342,12 @@ CXERBLAOBJ = \ CZBLAS1OBJS = \ cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ - cblas_zcopy.$(SUFFIX) \ + cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ - cblas_zaxpby.$(SUFFIX) \ + cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \ cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) @@ -1301,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c ifeq ($(BUILD_BFLOAT16),1) sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) -sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h +sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) endif @@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) +cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1627,6 +1653,15 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1932,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) 
cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) ifeq ($(BUILD_BFLOAT16),1) -cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h +cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) endif diff --git a/interface/gemmt.c b/interface/gemmt.c index 046432670..018deb7fb 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -78,6 +78,9 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, char transA, transB, Uplo; blasint nrowa, nrowb; +#if defined(COMPLEX) + blasint ncolb; +#endif IFLOAT *buffer; IFLOAT *aa, *bb; FLOAT *cc; @@ -155,19 +158,27 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB, uplo = 0; if (Uplo == 'L') uplo = 1; - + nrowa = m; - if (transa) nrowa = k; + if (transa & 1) nrowa = k; nrowb = k; - if (transb) nrowb = m; +#if defined(COMPLEX) + ncolb = m; +#endif + if (transb & 1) { + nrowb = m; +#if defined(COMPLEX) + ncolb = k; +#endif + } info = 0; if (ldc < MAX(1, m)) info = 13; - if (ldb < MAX(1, nrowa)) + if (ldb < MAX(1, nrowb)) info = 10; - if (lda < MAX(1, nrowb)) + if (lda < MAX(1, nrowa)) info = 8; if (k < 0) info = 5; @@ -211,6 +222,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint info; blasint lda, ldb; FLOAT *a, *b; +#if defined(COMPLEX) + blasint nrowb, ncolb; +#endif XFLOAT *buffer; PRINT_DEBUG_CNAME; @@ -262,11 +276,22 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - blasint nrowa, nrowb; + blasint nrowa; +#if !defined(COMPLEX) + blasint nrowb; +#endif nrowa = m; - if (transa) nrowa = k; + if (transa & 1) nrowa = k; nrowb = k; - if (transb) nrowb = m; +#if defined(COMPLEX) + ncolb = m; +#endif + if (transb & 1) { + nrowb = m; +#if defined(COMPLEX) + ncolb = k; +#endif + } if (ldc < MAX(1, m)) info = 13; @@ -330,26 +355,38 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, info = -1; - blasint ncola, ncolb; - ncola = k; - if (transa) ncola = m; - ncolb = m; - if (transb) ncolb = k; + blasint ncola; +#if !defined(COMPLEX) + blasint ncolb; +#endif + ncola = m; + if (transa & 1) ncola = k; + ncolb = k; +#if defined(COMPLEX) + nrowb = m; +#endif + + if (transb & 1) { +#if defined(COMPLEX) + nrowb = k; +#endif + ncolb = m; + } if (ldc < MAX(1,m)) info = 13; if (ldb < MAX(1, ncolb)) - info = 10; - if (lda < MAX(1, ncola)) info = 8; + if (lda < MAX(1, ncola)) + info = 10; if (k < 0) info = 5; if (m < 0) info = 4; if (transb < 0) - info = 3; - if (transa < 0) info = 2; + if (transa < 0) + info = 3; if (uplo < 0) info = 1; } @@ -428,7 +465,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, IDEBUG_START; - const blasint incb = (transb == 0) ? 1 : ldb; +#if defined(COMPLEX) + if (transb > 1){ +#ifndef CBLAS + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#else + if (order == CblasColMajor) + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); + if (order == CblasRowMajor) + IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#endif + } +#endif + + const blasint incb = ((transb & 1) == 0) ? 
1 : ldb; if (uplo == 1) { for (i = 0; i < m; i++) { @@ -438,19 +488,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #if defined(COMPLEX) aa = a + i * 2; bb = b + i * ldb * 2; - if (transa) { + if (transa & 1) { aa = a + lda * i * 2; } - if (transb) + if (transb & 1) bb = b + i * 2; cc = c + i * 2 * ldc + i * 2; #else aa = a + i; bb = b + i * ldb; - if (transa) { + if (transa & 1) { aa = a + lda * i; } - if (transb) + if (transb & 1) bb = b + i; cc = c + i * ldc + i; #endif @@ -461,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) - return; + continue; #else if (beta != ONE) SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); @@ -478,7 +528,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif // for alignment buffer_size = (buffer_size + 3) & ~3; - STACK_ALLOC(buffer_size, FLOAT, buffer); + STACK_ALLOC(buffer_size, IFLOAT, buffer); #ifdef SMP @@ -491,7 +541,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) - if (!transa) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, aa, lda, bb, incb, cc, 1, buffer); @@ -500,7 +550,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, aa, lda, bb, incb, cc, 1, buffer); #else - if (!transa) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, bb, incb, cc, 1, buffer); else @@ -509,7 +559,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #ifdef SMP } else { - if (!transa) + if (!(transa & 1)) (gemv_thread[(int)transa]) (j, k, alpha, aa, lda, bb, incb, cc, 1, buffer, @@ -533,13 +583,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, l = j; #if defined COMPLEX bb = b + i * ldb * 2; - if (transb) { + if (transb & 1) { bb = b + i * 2; } cc = c + i * 2 * ldc; #else bb = b + i * ldb; - if (transb) { + if (transb & 1) { bb = b + i; } cc = c + i * ldc; @@ -551,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) - return; + continue; #else if (beta != ONE) SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); @@ -567,7 +617,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif // for alignment buffer_size = (buffer_size + 3) & ~3; - STACK_ALLOC(buffer_size, FLOAT, buffer); + STACK_ALLOC(buffer_size, IFLOAT, buffer); #ifdef SMP @@ -580,7 +630,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #endif #if defined(COMPLEX) - if (!transa) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, a, lda, bb, incb, cc, 1, buffer); @@ -589,7 +639,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, a, lda, bb, incb, cc, 1, buffer); #else - if (!transa) + if (!(transa & 1)) (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, incb, cc, 1, buffer); else @@ -599,7 +649,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, #ifdef SMP } else { - if (!transa) + if (!(transa & 1)) (gemv_thread[(int)transa]) (j, k, alpha, a, lda, bb, incb, cc, 1, buffer, nthreads); diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 6a1ad282c..69876e31e 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -154,7 +154,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, } #endif - msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT); + if ( *rows > *cols ) + msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT); + else + msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) diff --git a/interface/max.c 
b/interface/max.c index f05977448..6c7d32bd9 100644 --- a/interface/max.c +++ b/interface/max.c @@ -145,8 +145,13 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ #else +#ifdef COMPLEX +FLOAT CNAME(blasint n, void *vx, blasint incx){ + FLOAT *x = (FLOAT*) vx; +#else FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ - +#endif + FLOAT ret; PRINT_DEBUG_CNAME; diff --git a/interface/rotmg.c b/interface/rotmg.c index 3a5ca8f95..b8f627221 100644 --- a/interface/rotmg.c +++ b/interface/rotmg.c @@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ else { dp2 = *dd2 * dy1; - if(dp2 == ZERO) - { - dflag = -TWO; - dparam[0] = dflag; - return; - } dp1 = *dd1 * *dx1; dq2 = dp2 * dy1; dq1 = dp1 * *dx1; @@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ dh12 = dp2 / dp1; du = ONE - dh12 * dh21; - if(du > ZERO) - { - dflag = ZERO; - *dd1 = *dd1 / du; - *dd2 = *dd2 / du; - *dx1 = *dx1 * du; - } else { - dflag = -ONE; - - dh11 = ZERO; - dh12 = ZERO; - dh21 = ZERO; - dh22 = ZERO; - - *dd1 = ZERO; - *dd2 = ZERO; - *dx1 = ZERO; - } + dflag = ZERO; + *dd1 = *dd1 / du; + *dd2 = *dd2 / du; + *dx1 = *dx1 * du; } else diff --git a/interface/sbgemmt.c b/interface/sbgemmt.c new file mode 100644 index 000000000..759af4bfb --- /dev/null +++ b/interface/sbgemmt.c @@ -0,0 +1,447 @@ +/*********************************************************************/ +/* Copyright 2024, The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#define SMP_THRESHOLD_MIN 65536.0 +#define ERROR_NAME "SBGEMMT " + +#ifndef GEMM_MULTITHREAD_THRESHOLD +#define GEMM_MULTITHREAD_THRESHOLD 4 +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANSA, char *TRANSB, + blasint * M, blasint * K, + FLOAT * Alpha, + IFLOAT * a, blasint * ldA, + IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) +{ + + blasint m, k; + blasint lda, ldb, ldc; + int transa, transb, uplo; + blasint info; + + char transA, transB, Uplo; + blasint nrowa, nrowb; + IFLOAT *buffer; + IFLOAT *aa, *bb; + FLOAT *cc; + FLOAT alpha, beta; + + PRINT_DEBUG_NAME; + + m = *M; + k = *K; + + alpha = *Alpha; + beta = *Beta; + + lda = *ldA; + ldb = *ldB; + ldc = *ldC; + + transA = *TRANSA; + transB = *TRANSB; + Uplo = *UPLO; + TOUPPER(transA); + TOUPPER(transB); + TOUPPER(Uplo); + + transa = -1; + transb = -1; + uplo = -1; + + if (transA == 'N') + transa = 0; + if (transA == 'T') + transa = 1; + + if (transA == 'R') + transa = 0; + if (transA == 'C') + transa = 1; + + if (transB == 'N') + transb = 0; + if (transB == 'T') + transb = 1; + + if (transB == 'R') + transb = 0; + if (transB == 'C') + transb = 1; + + if (Uplo == 'U') + uplo = 0; + if (Uplo == 'L') + uplo = 1; + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; + if (transb & 1) nrowb = m; + + info = 0; + + if (ldc < MAX(1, m)) + info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 3; + if (transa < 0) + info = 2; + if (uplo < 0) + info = 1; + + if (info != 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m, + blasint k, + FLOAT alpha, + IFLOAT * A, blasint LDA, + IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc) +{ + IFLOAT *aa, *bb; + FLOAT *cc; + + int transa, transb, uplo; + blasint info; + blasint lda, ldb; + IFLOAT *a, *b; + XFLOAT *buffer; + + PRINT_DEBUG_CNAME; + + uplo = -1; + transa = -1; + transb = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) + transa = 0; + if (TransA == CblasTrans) + transa = 1; + + if (TransA == CblasConjNoTrans) + transa = 0; + if (TransA == CblasConjTrans) + transa = 1; + + if (TransB == CblasNoTrans) + transb = 0; + if (TransB == CblasTrans) + transb = 1; + + if (TransB == CblasConjNoTrans) + transb = 0; + if (TransB == CblasConjTrans) + transb = 1; + + a = (void *)A; + b = (void *)B; + lda = LDA; + ldb = LDB; + + info = -1; + + blasint nrowa; + blasint nrowb; + nrowa = m; + if (transa & 1) nrowa = k; + nrowb = k; + if (transb & 1) nrowb = m; + + if (ldc < MAX(1, m)) + info = 13; + if (ldb < MAX(1, nrowb)) + info = 10; + if (lda < MAX(1, nrowa)) + info = 8; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 3; + if (transa < 0) + info = 2; + if (uplo < 0) + info = 1; + } + + if (order == CblasRowMajor) { + + a = (void *)B; + b = (void *)A; + + lda = LDB; + ldb = LDA; + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransB == CblasNoTrans) + transa = 0; + if (TransB == CblasTrans) + transa = 1; + + if (TransB == CblasConjNoTrans) + transa = 0; + if (TransB == CblasConjTrans) + transa = 1; + + if 
(TransA == CblasNoTrans) + transb = 0; + if (TransA == CblasTrans) + transb = 1; + + if (TransA == CblasConjNoTrans) + transb = 0; + if (TransA == CblasConjTrans) + transb = 1; + + info = -1; + + blasint ncola; + blasint ncolb; + + ncola = m; + if (transa & 1) ncola = k; + ncolb = k; + + if (transb & 1) { + ncolb = m; + } + + if (ldc < MAX(1,m)) + info = 13; + if (ldb < MAX(1, ncolb)) + info = 8; + if (lda < MAX(1, ncola)) + info = 10; + if (k < 0) + info = 5; + if (m < 0) + info = 4; + if (transb < 0) + info = 2; + if (transa < 0) + info = 3; + if (uplo < 0) + info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + int buffer_size; + blasint i, j; + +#ifdef SMP + int nthreads; +#endif + + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, + BLASLONG, IFLOAT *, BLASLONG, FLOAT, + FLOAT *, BLASLONG, int) = { + sbgemv_thread_n, sbgemv_thread_t, + }; +#endif + int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG, + IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = { + SBGEMV_N, SBGEMV_T,}; + + + if (m == 0) + return; + + IDEBUG_START; + + const blasint incb = ((transb & 1) == 0) ? 1 : ldb; + + if (uplo == 1) { + for (i = 0; i < m; i++) { + j = m - i; + + aa = a + i; + bb = b + i * ldb; + if (transa & 1) { + aa = a + lda * i; + } + if (transb & 1) + bb = b + i; + cc = c + i * ldc + i; + +#if 0 + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + + IDEBUG_START; + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, IFLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + if (!(transa & 1)) + (gemv[(int)transa]) (j, k, alpha, aa, lda, + bb, incb, beta, cc, 1); + else + (gemv[(int)transa]) (k, j, alpha, aa, lda, + bb, incb, beta, cc, 1); + +#ifdef SMP + } else { + if (!(transa & 1)) + (gemv_thread[(int)transa]) (j, k, alpha, aa, + lda, bb, incb, beta, cc, + 1, nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, aa, + lda, bb, incb, beta, cc, + 1, nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } else { + + for (i = 0; i < m; i++) { + j = i + 1; + + bb = b + i * ldb; + if (transb & 1) { + bb = b + i; + } + cc = c + i * ldc; + +#if 0 + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + IDEBUG_START; + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, IFLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + if (!(transa & 1)) + (gemv[(int)transa]) (j, k, alpha, a, lda, bb, + incb, beta, cc, 1); + else + (gemv[(int)transa]) (k, j, alpha, a, lda, bb, + incb, beta, cc, 1); + +#ifdef SMP + } else { + if (!(transa & 1)) + (gemv_thread[(int)transa]) (j, k, alpha, a, lda, + bb, incb, beta, cc, 1, + nthreads); + else + (gemv_thread[(int)transa]) (k, j, alpha, a, lda, + bb, incb, beta, cc, 1, + nthreads); + } +#endif + + STACK_FREE(buffer); + } + } + + IDEBUG_END; + + return; +} diff --git a/interface/zaxpby.c b/interface/zaxpby.c index 
3a4db7403..e5065270d 100644 --- a/interface/zaxpby.c +++ b/interface/zaxpby.c @@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef CBLAS -void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY) +void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY) { blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; + FLOAT* ALPHA = (FLOAT*) VALPHA; + FLOAT* BETA = (FLOAT*) VBETA; #else diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index b66489eb7..b879c9ac2 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -183,7 +183,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, } #endif - msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2; + if ( *rows > *cols ) + msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT) * 2; + else + msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT) * 2; b = malloc(msize); if ( b == NULL ) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 60314eedb..74e6760c2 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -1349,6 +1349,9 @@ endif () set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}") get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES) set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}") + if (USE_GEMM3M) + target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M) + endif() endfunction () diff --git a/kernel/generic/trmmkernel_16x8.c b/kernel/generic/trmmkernel_16x8.c new file mode 100644 index 000000000..5412eab70 --- /dev/null +++ b/kernel/generic/trmmkernel_16x8.c @@ -0,0 +1,3676 @@ +#include "common.h" + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + FLOAT res0_4; + FLOAT res0_5; + FLOAT res0_6; + FLOAT res0_7; + + FLOAT res0_8; + FLOAT res0_9; + FLOAT res0_10; + FLOAT res0_11; + FLOAT res0_12; + FLOAT res0_13; + FLOAT res0_14; + FLOAT res0_15; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + FLOAT res1_4; + FLOAT res1_5; + FLOAT res1_6; + FLOAT res1_7; + + FLOAT res1_8; + FLOAT res1_9; + FLOAT res1_10; + FLOAT res1_11; + FLOAT res1_12; + FLOAT res1_13; + FLOAT res1_14; + FLOAT res1_15; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + FLOAT res2_4; + FLOAT res2_5; + FLOAT res2_6; + FLOAT res2_7; + + FLOAT res2_8; + FLOAT res2_9; + FLOAT res2_10; + FLOAT res2_11; + FLOAT res2_12; + FLOAT res2_13; + FLOAT res2_14; + FLOAT res2_15; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + FLOAT res3_4; + FLOAT res3_5; + FLOAT res3_6; + FLOAT res3_7; + + FLOAT res3_8; + FLOAT res3_9; + FLOAT res3_10; + FLOAT res3_11; + FLOAT res3_12; + FLOAT res3_13; + FLOAT res3_14; + FLOAT res3_15; + + FLOAT res4_0; + FLOAT res4_1; + FLOAT res4_2; + FLOAT res4_3; + FLOAT res4_4; + FLOAT res4_5; + FLOAT res4_6; + FLOAT res4_7; + + FLOAT res4_8; + FLOAT res4_9; + FLOAT res4_10; + FLOAT res4_11; + FLOAT res4_12; + FLOAT res4_13; + FLOAT res4_14; + FLOAT res4_15; + + FLOAT res5_0; + FLOAT res5_1; + FLOAT res5_2; + FLOAT res5_3; + FLOAT res5_4; + FLOAT res5_5; + FLOAT res5_6; + FLOAT res5_7; + + FLOAT res5_8; + FLOAT res5_9; + FLOAT res5_10; + FLOAT res5_11; + FLOAT 
res5_12; + FLOAT res5_13; + FLOAT res5_14; + FLOAT res5_15; + + FLOAT res6_0; + FLOAT res6_1; + FLOAT res6_2; + FLOAT res6_3; + FLOAT res6_4; + FLOAT res6_5; + FLOAT res6_6; + FLOAT res6_7; + + FLOAT res6_8; + FLOAT res6_9; + FLOAT res6_10; + FLOAT res6_11; + FLOAT res6_12; + FLOAT res6_13; + FLOAT res6_14; + FLOAT res6_15; + + FLOAT res7_0; + FLOAT res7_1; + FLOAT res7_2; + FLOAT res7_3; + FLOAT res7_4; + FLOAT res7_5; + FLOAT res7_6; + FLOAT res7_7; + + FLOAT res7_8; + FLOAT res7_9; + FLOAT res7_10; + FLOAT res7_11; + FLOAT res7_12; + FLOAT res7_13; + FLOAT res7_14; + FLOAT res7_15; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + FLOAT b4; + FLOAT b5; + FLOAT b6; + FLOAT b7; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j=0; j +#include "common.h" + +#define a2 (a1 + 2) +#define a4 (a3 + 2) +#define a6 (a5 + 2) +#define a8 (a7 + 2) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT A1, A2, A3, A4, A5, A6, A7, A8; + FLOAT B1, B2, B3, B4, B5, B6, B7, B8; + + FLOAT A9, A10, A11, A12, A13, A14, A15, A16; + FLOAT B9, B10, B11, B12, B13, B14, B15, B16; + + a -= 2; + lda *= 2; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = (n >> 3); + if (j > 0) { + do { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + for( int pass = 0; pass < 2; ++pass ) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + A2 = *(a2 + 0); + A10 = *(a2 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + A4 = *(a4 + 0); + A12 = *(a4 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + A6 = *(a6 + 0); + A14 = *(a6 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + A8 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B9 = *(b1 + 1); + B2 = *(b2 + 0); + B10 = *(b2 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + B4 = *(b4 + 0); + B12 = *(b4 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + B6 = *(b6 + 0); + B14 = *(b6 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + B8 = *(b8 + 0); + B16 = *(b8 + 1); + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = 
A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A1; + *(b2 + 1) = A9; + *(b4 + 0) = A3; + *(b4 + 1) = A11; + *(b6 + 0) = A5; + *(b6 + 1) = A13; + *(b8 + 0) = A7; + *(b8 + 1) = A15; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + *(b1 + 0) = A2; + *(b1 + 1) = A10; + *(b3 + 0) = A4; + *(b3 + 1) = A12; + *(b5 + 0) = A6; + *(b5 + 1) = A14; + *(b7 + 0) = A8; + *(b7 + 1) = A16; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } + b1 += 4*lda; + b2 += 4*lda; + b3 += 4*lda; + b4 += 4*lda; + b5 += 4*lda; + b6 += 4*lda; + b7 += 4*lda; + b8 += 4*lda; + + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; + + buffer += 16; + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + B1 = *(b1 + 0); + B9 = *(b1 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + } else { + 
*(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } + buffer += 8; + } + + a += 4 * lda; + + j --; + } while (j > 0); + } + + + if (n & 4) { + { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + A2 = *(a2 + 0); + A10 = *(a2 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + A4 = *(a4 + 0); + A12 = *(a4 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + A6 = *(a6 + 0); + A14 = *(a6 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + A8 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B9 = *(b1 + 1); + B2 = *(b2 + 0); + B10 = *(b2 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + B4 = *(b4 + 0); + B12 = *(b4 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + B6 = *(b6 + 0); + B14 = *(b6 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + B8 = *(b8 + 0); + B16 = *(b8 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A1; + *(b2 + 1) = A9; + *(b4 + 0) = A3; + *(b4 + 1) = A11; + *(b6 + 0) = A5; + *(b6 + 1) = A13; + *(b8 + 0) = A7; + *(b8 + 1) = A15; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer 
+ 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + *(b1 + 0) = A2; + *(b1 + 1) = A10; + *(b3 + 0) = A4; + *(b3 + 1) = A12; + *(b5 + 0) = A6; + *(b5 + 1) = A14; + *(b7 + 0) = A8; + *(b7 + 1) = A16; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } + + buffer += 16; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + B1 = *(b1 + 0); + B9 = *(b1 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } + buffer += 8; + } + + a += 4 * lda; + } + } //if (n & 4) + + if (n & 2) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + a3 = a1 + lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 
7) = A8; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else { + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else { + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + } + } + + buffer += 8; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + a1 += 4; + a3 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + A3 = *(a3 + 0); + A4 = *(a3 + 1); + B3 = *(b3 + 0); + B4 = *(b3 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A3; + *(b3 + 1) = A4; + } + buffer += 4; + } + + a += 2 * lda; + } + + if (n & 1) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + *(b1 + 0) = 
A3; + *(b1 + 1) = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + // buffer += 2; + } + } + + return 0; +} + diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index e27ce3bee..c7ef44035 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -58,6 +58,8 @@ ZAXPYKERNEL = caxpy_lsx.S SAXPBYKERNEL = axpby_lsx.S DAXPBYKERNEL = axpby_lsx.S +CAXPBYKERNEL = caxpby_lsx.S +ZAXPBYKERNEL = caxpby_lsx.S SSUMKERNEL = sum_lsx.S DSUMKERNEL = sum_lsx.S @@ -98,9 +100,13 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -CGEMMKERNEL = cgemm_kernel_2x2_lsx.S -CGEMMONCOPY = cgemm_ncopy_2_lsx.S -CGEMMOTCOPY = cgemm_tcopy_2_lsx.S +CGEMMKERNEL = cgemm_kernel_8x4_lsx.S +CGEMMINCOPY = cgemm_ncopy_8_lsx.S +CGEMMITCOPY = cgemm_tcopy_8_lsx.S +CGEMMONCOPY = cgemm_ncopy_4_lsx.S +CGEMMOTCOPY = cgemm_tcopy_4_lsx.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -109,4 +115,14 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S +ZGEMMONCOPY = zgemm_ncopy_4_lsx.S +ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index f4429cfba..17d15656a 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -58,6 +58,8 @@ ZAXPYKERNEL = caxpy_lasx.S SAXPBYKERNEL = axpby_lasx.S DAXPBYKERNEL = axpby_lasx.S +CAXPBYKERNEL = caxpby_lasx.S +ZAXPBYKERNEL = caxpby_lasx.S SSUMKERNEL = sum_lasx.S DSUMKERNEL = sum_lasx.S @@ -120,9 +122,13 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -ZGEMMKERNEL = zgemm_kernel_2x2_lasx.S -ZGEMMONCOPY = zgemm_ncopy_2_lasx.S -ZGEMMOTCOPY = zgemm_tcopy_2_lasx.S +ZGEMMKERNEL = zgemm_kernel_8x4_lasx.S +ZGEMMINCOPY = zgemm_ncopy_8_lasx.S +ZGEMMITCOPY = zgemm_tcopy_8_lasx.S +ZGEMMONCOPY = zgemm_ncopy_4_lasx.S +ZGEMMOTCOPY = zgemm_tcopy_4_lasx.S +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S index c3c3f4ae9..47701b6e4 100644 --- a/kernel/loongarch64/amin_lsx.S +++ b/kernel/loongarch64/amin_lsx.S @@ -124,7 +124,6 @@ USE OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L13: FABS $f0, $f0 - SUB $f0, $f0, $f0 jirl $r0, $r1, 0x0 .align 3 diff --git a/kernel/loongarch64/axpby_lasx.S b/kernel/loongarch64/axpby_lasx.S index f1d99cd3b..7a246ca5c 100644 --- a/kernel/loongarch64/axpby_lasx.S +++ b/kernel/loongarch64/axpby_lasx.S @@ -57,10 +57,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE bge $r0, N, .L999 - li.d TEMP, 1 movgr2fr.d a1, $r0 ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT MTG t1, ALPHA @@ -75,6 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvreplgr2vr.w VXB, t2 xvreplgr2vr.w VXZ, t3 #endif + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 diff --git a/kernel/loongarch64/axpby_lsx.S b/kernel/loongarch64/axpby_lsx.S index 45154c262..e50d4cdcc 100644 --- a/kernel/loongarch64/axpby_lsx.S +++ b/kernel/loongarch64/axpby_lsx.S @@ -57,10 +57,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE bge $r0, N, .L999 - li.d TEMP, 1 movgr2fr.d a1, $r0 ffint.s.l a1, a1 - slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT MTG t1, ALPHA @@ -75,6 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vreplgr2vr.w VXB, t2 vreplgr2vr.w VXZ, t3 #endif + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, BASE_SHIFT srai.d I, N, 3 bne INCX, TEMP, .L20 bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 diff --git a/kernel/loongarch64/caxpby_lasx.S b/kernel/loongarch64/caxpby_lasx.S new file mode 100644 index 000000000..c5802092e --- /dev/null +++ b/kernel/loongarch64/caxpby_lasx.S @@ -0,0 +1,1046 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r5 +#define INCX $r6 +#define BETAR $f2 +#define BETAI $f3 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $xr8 +#define VX1 $xr20 +#define VX2 $xr21 +#define VX3 $xr22 +#define VXAR $xr23 +#define VXAI $xr19 +#define VXBR $xr14 +#define VXBI $xr13 +#define VXZ $xr12 +#define x1 $xr18 +#define x2 $xr17 +#define x3 $xr16 +#define x4 $xr15 + + PROLOGUE + + bge $r0, N, .L999 + movgr2fr.d a1, $r0 + FFINT a1, a1 + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT + MTG t1, ALPHAR + MTG t2, ALPHAI + MTG t3, BETAR + MTG t4, BETAI +#ifdef DOUBLE + xvreplgr2vr.d VXAR, t1 + xvreplgr2vr.d VXAI, t2 + xvreplgr2vr.d VXBR, t3 + xvreplgr2vr.d VXBI, t4 +#else + xvreplgr2vr.w VXAR, t1 + xvreplgr2vr.w VXAI, t2 + xvreplgr2vr.w VXBR, t3 + xvreplgr2vr.w VXBI, t4 +#endif + xvxor.v VXZ, VXZ, VXZ + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT +#ifdef DOUBLE + srai.d I, N, 2 +#else + srai.d I, N, 3 +#endif + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and 
INCY==1 + +.L11: + bge $r0, I, .L997 + CMPEQ $fcc0, BETAR, a1 + CMPEQ $fcc1, BETAI, a1 + CMPEQ $fcc2, ALPHAR, a1 + CMPEQ $fcc3, ALPHAI, a1 + bceqz $fcc0, .L13 + bceqz $fcc1, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc2, .L114 + bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + +.L14: + bceqz $fcc2, .L112 + bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L111: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + xvst VXZ, Y, 0 * SIZE +#ifdef DOUBLE + xvst VXZ, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvst VXZ, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 + +.L112: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + XVFMUL x3, VXAI, x2 + XVFMUL x4, VXAI, x1 + XVMSUB x3, VXAR, x1, x3 + XVFMADD x4, VXAR, x2, x4 +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 + +.L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + xvld VX0, Y, 0 * SIZE + xvld VX1, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 +#else + xvld VX0, Y, 0 * SIZE + xvld VX1, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 +#endif + XVFMUL x3, VXBI, x2 + XVFMUL x4, VXBI, x1 + XVMSUB x3, VXBR, x1, x3 + XVFMADD x4, VXBR, x2, x4 +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 + +.L114: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + xvld VX1, X, 4 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 4 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 +#else + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvld VX2, Y, 0 * SIZE + xvld VX3, Y, 8 * SIZE + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 +#endif + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 +#ifdef DOUBLE + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE +#else + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d X, X, 16 * SIZE + addi.d Y, Y, 16 * SIZE +#endif + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 + +.L12: // INCX==1 and INCY!=1 + bge 
$r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 2 + xvinsgr2vr.d x4, t4, 2 + + xvld VX1, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 1 + xvinsgr2vr.d x4, t2, 1 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX1, VXAI, x1 + xvfmul.d VX2, VXBI, x4 + xvfmul.d VX3, VXBI, x3 + xvfmsub.d VX0, VXAR, x1, VX0 + xvfmadd.d VX1, VXAR, x2, VX1 + xvfmsub.d VX2, VXBR, x3, VX2 + xvfmadd.d VX3, VXBR, x4, VX3 + xvfadd.d x3, VX0, VX2 + xvfadd.d x4, VX1, VX3 + addi.d I, I, -1 + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#else + xvld VX0, X, 0 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + xvld VX1, X, 8 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + add.d Y, Y, INCY + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + xvpickev.w x1, VX1, VX0 + xvpickod.w x2, VX1, VX0 + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + addi.d X, X, 16 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#endif + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + 
ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 2 + xvinsgr2vr.d x2, t4, 2 + xvld VX3, Y, 4 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 1 + xvinsgr2vr.d x2, t2, 1 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + + xvpickev.d x3, VX3, VX2 + xvpickod.d x4, VX3, VX2 + xvfmul.d VX0, VXAI, x2 + xvfmul.d VX1, VXAI, x1 + xvfmul.d VX2, VXBI, x4 + xvfmul.d VX3, VXBI, x3 + xvfmsub.d VX0, VXAR, x1, VX0 + xvfmadd.d VX1, VXAR, x2, VX1 + xvfmsub.d VX2, VXBR, x3, VX2 + xvfmadd.d VX3, VXBR, x4, VX3 + xvfadd.d x3, VX0, VX2 + xvfadd.d x4, VX1, VX3 + xvilvl.d VX2, x4 ,x3 + xvilvh.d VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#else + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + xvld VX3, Y, 8 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + + xvpickev.w x3, VX3, VX2 + xvpickod.w x4, VX3, VX2 + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 + xvilvl.w VX2, x4 ,x3 + xvilvh.w VX3, x4, x3 + addi.d I, I, -1 + xvst VX2, Y, 0 * SIZE + xvst VX3, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#endif + +.L22: + bge $r0, I, .L997 + move YY, Y + CMPEQ $fcc0, BETAR, a1 + CMPEQ $fcc1, BETAI, a1 + CMPEQ $fcc2, ALPHAR, a1 + CMPEQ $fcc3, ALPHAI, a1 + bceqz $fcc0, .L23 + bceqz $fcc1, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc2, .L224 + bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L24: + bceqz $fcc2, .L222 + bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L221: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.d VXZ, Y, 0, 0 + xvstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#else + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w 
VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, YY, 0, 0 + xvstelm.w VXZ, YY, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + xvstelm.w VXZ, Y, 0, 0 + xvstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#endif + +.L222: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d X, X, INCX + xvfmul.d x3, VXAI, x2 + xvfmul.d x4, VXAI, x1 + xvfmsub.d x3, VXAR, x1, x3 + xvfmadd.d x4, VXAR, x2, x4 + addi.d I, I, -1 + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#else + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d X, X, INCX + XVFMUL x3, VXAI, x2 + XVFMUL x4, VXAI, x1 + XVMSUB x3, VXAR, x1, x3 + XVFMADD x4, VXAR, x2, x4 + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#endif + +.L223: +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x1, 
t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + add.d Y, Y, INCY + xvfmul.d x3, VXBI, x2 + xvfmul.d x4, VXBI, x1 + xvfmsub.d x3, VXBR, x1, x3 + xvfmadd.d x4, VXBR, x2, x4 + + addi.d I, I, -1 + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#else + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + add.d Y, Y, INCY + + XVFMUL x3, VXBI, x2 + XVFMUL x4, VXBI, x1 + XVMSUB x3, VXBR, x1, x3 + XVFMADD x4, VXBR, x2, x4 + addi.d I, I, -1 + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#endif + +.L224: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.d x1, t1, 2 + xvinsgr2vr.d x2, t2, 2 + xvinsgr2vr.d x1, t3, 3 + xvinsgr2vr.d x2, t4, 3 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.d x3, t1, 0 + xvinsgr2vr.d x4, t2, 0 + xvinsgr2vr.d x3, t3, 1 + xvinsgr2vr.d x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.d x3, t1, 2 + xvinsgr2vr.d x4, t2, 2 + xvinsgr2vr.d x3, t3, 3 + xvinsgr2vr.d x4, t4, 3 + add.d Y, Y, INCY + xvfmul.d 
VX0, VXAI, x2 + xvfmul.d VX1, VXAI, x1 + xvfmul.d VX2, VXBI, x4 + xvfmul.d VX3, VXBI, x3 + xvfmsub.d VX0, VXAR, x1, VX0 + xvfmadd.d VX1, VXAR, x2, VX1 + xvfmsub.d VX2, VXBR, x3, VX2 + xvfmadd.d VX3, VXBR, x4, VX3 + xvfadd.d x3, VX0, VX2 + xvfadd.d x4, VX1, VX3 + addi.d I, I, -1 + + xvstelm.d x3, YY, 0 * SIZE, 0 + xvstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 1 + xvstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 2 + xvstelm.d x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.d x3, YY, 0 * SIZE, 3 + xvstelm.d x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L224 + b .L997 + .align 3 +#else + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 0 + xvinsgr2vr.w x2, t2, 0 + xvinsgr2vr.w x1, t3, 1 + xvinsgr2vr.w x2, t4, 1 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 2 + xvinsgr2vr.w x2, t2, 2 + xvinsgr2vr.w x1, t3, 3 + xvinsgr2vr.w x2, t4, 3 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 4 + xvinsgr2vr.w x2, t2, 4 + xvinsgr2vr.w x1, t3, 5 + xvinsgr2vr.w x2, t4, 5 + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + xvinsgr2vr.w x1, t1, 6 + xvinsgr2vr.w x2, t2, 6 + xvinsgr2vr.w x1, t3, 7 + xvinsgr2vr.w x2, t4, 7 + + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 0 + xvinsgr2vr.w x4, t2, 0 + xvinsgr2vr.w x3, t3, 1 + xvinsgr2vr.w x4, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 2 + xvinsgr2vr.w x4, t2, 2 + xvinsgr2vr.w x3, t3, 3 + xvinsgr2vr.w x4, t4, 3 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + add.d Y, Y, INCY + xvinsgr2vr.w x3, t1, 4 + xvinsgr2vr.w x4, t2, 4 + xvinsgr2vr.w x3, t3, 5 + xvinsgr2vr.w x4, t4, 5 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + xvinsgr2vr.w x3, t1, 6 + xvinsgr2vr.w x4, t2, 6 + xvinsgr2vr.w x3, t3, 7 + xvinsgr2vr.w x4, t4, 7 + add.d Y, Y, INCY + + XVFMUL VX0, VXAI, x2 + XVFMUL VX1, VXAI, x1 + XVFMUL VX2, VXBI, x4 + XVFMUL VX3, VXBI, x3 + XVMSUB VX0, VXAR, x1, VX0 + XVFMADD VX1, VXAR, x2, VX1 + XVMSUB VX2, VXBR, x3, VX2 + XVFMADD VX3, VXBR, x4, VX3 + XVFADD x3, VX0, VX2 + XVFADD x4, VX1, VX3 + addi.d I, I, -1 + + xvstelm.w x3, YY, 0 * SIZE, 0 + xvstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 1 + xvstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 2 + xvstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 3 + xvstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 4 + xvstelm.w x4, YY, 1 * SIZE, 4 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 5 + xvstelm.w x4, YY, 1 * SIZE, 5 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 6 + xvstelm.w x4, YY, 1 * SIZE, 6 + add.d YY, YY, INCY + xvstelm.w x3, YY, 0 * SIZE, 7 + xvstelm.w x4, YY, 1 * SIZE, 7 + add.d YY, YY, INCY + blt $r0, I, .L224 + b .L997 + .align 3 +#endif + +.L997: + andi I, N, 7 + bge $r0, I, 
.L999 + .align 3 + +.L998: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + addi.d I, I, -1 + MUL s1, ALPHAI, a2 + MUL s2, ALPHAI, a1 + MUL s3, BETAI, a4 + MUL s4, BETAI, a3 + MSUB s1, ALPHAR, a1, s1 + MADD s2, a2, ALPHAR, s2 + MSUB s3, BETAR, a3, s3 + MADD s4, a4, BETAR, s4 + ADD s3, s3, s1 + ADD s4, s4, s2 + ST s3, Y, 0 * SIZE + ST s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 + +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/caxpby_lsx.S b/kernel/loongarch64/caxpby_lsx.S new file mode 100644 index 000000000..247ae428e --- /dev/null +++ b/kernel/loongarch64/caxpby_lsx.S @@ -0,0 +1,1029 @@ +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define ALPHAR $f0 +#define ALPHAI $f1 +#define X $r5 +#define INCX $r6 +#define BETAR $f2 +#define BETAI $f3 +#define Y $r7 +#define INCY $r8 + +#define I $r12 +#define TEMP $r13 +#define t1 $r14 +#define t2 $r16 +#define t3 $r15 +#define t4 $r17 +#define XX $r18 +#define YY $r19 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define s1 $f16 +#define s2 $f17 +#define s3 $f18 +#define s4 $f19 +#define VX0 $vr8 +#define VX1 $vr20 +#define VX2 $vr21 +#define VX3 $vr22 +#define VXAR $vr23 +#define VXAI $vr19 +#define VXBR $vr14 +#define VXBI $vr13 +#define VXZ $vr12 +#define x1 $vr18 +#define x2 $vr17 +#define x3 $vr16 +#define x4 $vr15 + + PROLOGUE + + bge $r0, N, .L999 + movgr2fr.d a1, $r0 +#ifdef DOUBLE + ffint.d.l a1, a1 +#else + ffint.s.l a1, a1 +#endif + slli.d INCX, INCX, ZBASE_SHIFT + slli.d INCY, INCY, ZBASE_SHIFT +#ifdef DOUBLE + movfr2gr.d t1, ALPHAR + vreplgr2vr.d VXAR, t1 + movfr2gr.d t2, ALPHAI + vreplgr2vr.d VXAI, t2 + movfr2gr.d t3, BETAR + vreplgr2vr.d VXBR, t3 + movfr2gr.d t4, BETAI + vreplgr2vr.d VXBI, t4 +#else + movfr2gr.s t1, ALPHAR + vreplgr2vr.w VXAR, t1 + movfr2gr.s t2, ALPHAI + vreplgr2vr.w VXAI, t2 + movfr2gr.s t3, BETAR + vreplgr2vr.w VXBR, t3 + movfr2gr.s t4, BETAI + vreplgr2vr.w VXBI, t4 +#endif + vxor.v VXZ, VXZ, VXZ + // If incx == 0 || incy == 0, do one by one + and TEMP, INCX, INCY + or I, N, N + beqz TEMP, .L998 + + li.d TEMP, 1 + slli.d TEMP, TEMP, ZBASE_SHIFT + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 + b .L11 // INCX==1 and INCY==1 +.L20: + bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 + b .L21 // INCX!=1 and INCY==1 + +.L11: + bge $r0, I, .L997 +#ifdef DOUBLE + fcmp.ceq.d $fcc0, BETAR, a1 + fcmp.ceq.d $fcc1, BETAI, a1 + fcmp.ceq.d $fcc2, ALPHAR, a1 + fcmp.ceq.d $fcc3, ALPHAI, a1 +#else + fcmp.ceq.s $fcc0, BETAR, a1 + fcmp.ceq.s $fcc1, BETAI, a1 + fcmp.ceq.s $fcc2, ALPHAR, a1 + fcmp.ceq.s $fcc3, ALPHAI, a1 +#endif + bceqz $fcc0, .L13 + bceqz $fcc1, .L13 + b .L14 + .align 3 + +.L13: + bceqz $fcc2, .L114 + bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + +.L14: + bceqz $fcc2, .L112 + bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L111: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 2 * SIZE + vst VXZ, Y, 4 * SIZE + vst VXZ, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 +#else + vst VXZ, Y, 0 * SIZE + vst VXZ, Y, 4 * SIZE + 
addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L111 + b .L997 + .align 3 +#endif + +.L112: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXAI, x2 + vfmul.s x4, VXAI, x1 + vfmsub.s x3, VXAR, x1, x3 + vfmadd.s x4, VXAR, x2, x4 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L112 + b .L997 + .align 3 +#endif + +.L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + vld VX0, Y, 0 * SIZE + vld VX1, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + vld VX0, Y, 4 * SIZE + vld VX1, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 +#else + vld VX0, Y, 0 * SIZE + vld VX1, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s x3, VXBI, x2 + vfmul.s x4, VXBI, x1 + vfmsub.s x3, VXBR, x1, x3 + vfmadd.s x4, VXBR, x2, x4 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L113 + b .L997 + .align 3 +#endif + +.L114: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d 
X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 +#else + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + addi.d I, I, -1 + blt $r0, I, .L114 + b .L997 + .align 3 +#endif + +.L12: // INCX==1 and INCY!=1 + bge $r0, I, .L997 + move YY, Y + .align 3 + +.L121: +#ifdef DOUBLE + vld VX0, X, 0 * SIZE + vld VX1, X, 2 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + vld VX0, X, 4 * SIZE + vld VX1, X, 6 * SIZE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vpickev.d x1, VX1, VX0 + vpickod.d x2, VX1, VX0 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#else + vld VX0, X, 0 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + + vld VX1, X, 4 * SIZE + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + + vpickev.w x1, VX1, VX0 + vpickod.w x2, VX1, VX0 + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, 
YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE + blt $r0, I, .L121 + b .L997 + .align 3 +#endif + +.L21:// INCX!=1 and INCY==1 + bge $r0, I, .L997 + .align 3 + +.L211: +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + vld VX3, Y, 2 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 2 * SIZE + + vld VX2, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vpickev.d x3, VX3, VX2 + vpickod.d x4, VX3, VX2 + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vilvl.d VX2, x4 ,x3 + vilvh.d VX3, x4, x3 + addi.d I, I, -1 + vst VX3, Y, 4 * SIZE + vst VX3, Y, 6 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + vld VX3, Y, 4 * SIZE + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + + vpickev.w x3, VX3, VX2 + vpickod.w x4, VX3, VX2 + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + vilvl.w VX2, x4 ,x3 + vilvh.w VX3, x4, x3 + addi.d I, I, -1 + vst VX2, Y, 0 * SIZE + vst VX3, Y, 4 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L211 + b .L997 + .align 3 +#endif + +.L22: + bge $r0, I, .L997 + move YY, Y +#ifdef DOUBLE + fcmp.ceq.d $fcc0, BETAR, a1 + fcmp.ceq.d $fcc1, BETAI, a1 + fcmp.ceq.d $fcc2, ALPHAR, a1 + fcmp.ceq.d $fcc3, ALPHAI, a1 +#else + fcmp.ceq.s $fcc0, BETAR, a1 + fcmp.ceq.s $fcc1, BETAI, a1 + fcmp.ceq.s $fcc2, ALPHAR, a1 + fcmp.ceq.s $fcc3, ALPHAI, a1 +#endif + bceqz $fcc0, .L23 + bceqz $fcc1, .L23 + b .L24 + .align 3 + +.L23: + bceqz $fcc2, .L224 + bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L24: + bceqz $fcc2, .L222 + bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) + b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0) + .align 3 + +.L221: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && 
alpha_i == 0.0) +#ifdef DOUBLE + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.d VXZ, Y, 0, 0 + vstelm.d VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#else + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + vstelm.w VXZ, Y, 0, 0 + vstelm.w VXZ, Y, 0, 0 + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L221 + b .L997 + .align 3 +#endif + +.L222: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0) +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d X, X, INCX + vfmul.d x3, VXAI, x2 + vfmul.d x4, VXAI, x1 + vfmsub.d x3, VXAR, x1, x3 + vfmadd.d x4, VXAR, x2, x4 + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d X, X, INCX + vfmul.s x3, VXAI, x2 + vfmul.s x4, VXAI, x1 + vfmsub.s x3, VXAR, x1, x3 + vfmadd.s x4, VXAR, x2, x4 + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L222 + b .L997 + .align 3 +#endif + +.L223: +#ifdef DOUBLE + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d Y, Y, INCY + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + add.d Y, Y, 
INCY + vfmul.d x3, VXBI, x2 + vfmul.d x4, VXBI, x1 + vfmsub.d x3, VXBR, x1, x3 + vfmadd.d x4, VXBR, x2, x4 + addi.d I, I, -1 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#else + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + add.d Y, Y, INCY + vfmul.s x3, VXBI, x2 + vfmul.s x4, VXBI, x1 + vfmsub.s x3, VXBR, x1, x3 + vfmadd.s x4, VXBR, x2, x4 + + addi.d I, I, -1 + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L223 + b .L997 + .align 3 +#endif + +.L224: +#ifdef DOUBLE + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + + ld.d t1, X, 0 * SIZE + ld.d t2, X, 1 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + ld.d t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.d x1, t1, 0 + vinsgr2vr.d x2, t2, 0 + vinsgr2vr.d x1, t3, 1 + vinsgr2vr.d x2, t4, 1 + ld.d t1, Y, 0 * SIZE + ld.d t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 1 * SIZE + vinsgr2vr.d x3, t1, 0 + vinsgr2vr.d x4, t2, 0 + vinsgr2vr.d x3, t3, 1 + vinsgr2vr.d x4, t4, 1 + add.d Y, Y, INCY + vfmul.d VX0, VXAI, x2 + vfmul.d VX1, VXAI, x1 + vfmul.d VX2, VXBI, x4 + vfmul.d VX3, VXBI, x3 + vfmsub.d VX0, VXAR, x1, VX0 + vfmadd.d VX1, VXAR, x2, VX1 + vfmsub.d VX2, VXBR, x3, VX2 + vfmadd.d VX3, VXBR, x4, VX3 + vfadd.d x3, VX0, VX2 + vfadd.d x4, VX1, VX3 + vstelm.d x3, YY, 0 * SIZE, 0 + vstelm.d x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.d x3, YY, 0 * SIZE, 1 + vstelm.d x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + addi.d I, I, -1 + blt $r0, I, .L224 + b .L997 + .align 3 +#else + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 0 + vinsgr2vr.w x2, t2, 0 + vinsgr2vr.w x1, t3, 1 + vinsgr2vr.w x2, t4, 1 + ld.w t1, X, 0 * SIZE + ld.w t2, X, 1 * SIZE + add.d X, X, INCX + ld.w t3, X, 0 * SIZE + ld.w t4, X, 1 * SIZE + add.d X, X, INCX + vinsgr2vr.w x1, t1, 2 + vinsgr2vr.w x2, t2, 2 + 
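+    // x1 gathers the real parts and x2 the imaginary parts of four strided x elements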
vinsgr2vr.w x1, t3, 3 + vinsgr2vr.w x2, t4, 3 + + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + add.d Y, Y, INCY + vinsgr2vr.w x3, t1, 0 + vinsgr2vr.w x4, t2, 0 + vinsgr2vr.w x3, t3, 1 + vinsgr2vr.w x4, t4, 1 + ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 1 * SIZE + add.d Y, Y, INCY + ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 1 * SIZE + vinsgr2vr.w x3, t1, 2 + vinsgr2vr.w x4, t2, 2 + vinsgr2vr.w x3, t3, 3 + vinsgr2vr.w x4, t4, 3 + add.d Y, Y, INCY + vfmul.s VX0, VXAI, x2 + vfmul.s VX1, VXAI, x1 + vfmul.s VX2, VXBI, x4 + vfmul.s VX3, VXBI, x3 + vfmsub.s VX0, VXAR, x1, VX0 + vfmadd.s VX1, VXAR, x2, VX1 + vfmsub.s VX2, VXBR, x3, VX2 + vfmadd.s VX3, VXBR, x4, VX3 + vfadd.s x3, VX0, VX2 + vfadd.s x4, VX1, VX3 + addi.d I, I, -1 + + vstelm.w x3, YY, 0 * SIZE, 0 + vstelm.w x4, YY, 1 * SIZE, 0 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 1 + vstelm.w x4, YY, 1 * SIZE, 1 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 2 + vstelm.w x4, YY, 1 * SIZE, 2 + add.d YY, YY, INCY + vstelm.w x3, YY, 0 * SIZE, 3 + vstelm.w x4, YY, 1 * SIZE, 3 + add.d YY, YY, INCY + blt $r0, I, .L224 + b .L997 + .align 3 +#endif + +.L997: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L998: +#ifdef DOUBLE + fld.d a1, X, 0 * SIZE + fld.d a2, X, 1 * SIZE + fld.d a3, Y, 0 * SIZE + fld.d a4, Y, 1 * SIZE + addi.d I, I, -1 + fmul.d s1, ALPHAI, a2 + fmul.d s2, ALPHAI, a1 + fmul.d s3, BETAI, a4 + fmul.d s4, BETAI, a3 + fmsub.d s1, ALPHAR, a1, s1 + fmadd.d s2, a2, ALPHAR, s2 + fmsub.d s3, BETAR, a3, s3 + fmadd.d s4, a4, BETAR, s4 + fadd.d s3, s3, s1 + fadd.d s4, s4, s2 + fst.d s3, Y, 0 * SIZE + fst.d s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 +#else + fld.s a1, X, 0 * SIZE + fld.s a2, X, 1 * SIZE + fld.s a3, Y, 0 * SIZE + fld.s a4, Y, 1 * SIZE + addi.d I, I, -1 + fmul.s s1, ALPHAI, a2 + fmul.s s2, ALPHAI, a1 + fmul.s s3, BETAI, a4 + fmul.s s4, BETAI, a3 + fmsub.s s1, ALPHAR, a1, s1 + fmadd.s s2, a2, ALPHAR, s2 + fmsub.s s3, BETAR, a3, s3 + fmadd.s s4, a4, BETAR, s4 + fadd.s s3, s3, s1 + fadd.s s4, s4, s2 + fst.s s3, Y, 0 * SIZE + fst.s s4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L998 + .align 3 +#endif +.L999: + move $r4, $r12 + jirl $r0, $r1, 0x0 + .align 3 + + EPILOGUE diff --git a/kernel/loongarch64/cgemm_kernel_8x4_lsx.S b/kernel/loongarch64/cgemm_kernel_8x4_lsx.S new file mode 100644 index 000000000..1e9fd8524 --- /dev/null +++ b/kernel/loongarch64/cgemm_kernel_8x4_lsx.S @@ -0,0 +1,3313 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 +#define T2 $r26 +#define T3 $r27 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LSX vectors */ +#define U0 $vr30 +#define U1 $vr31 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define VALPHAR $vr28 +#define VALPHAI $vr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB 
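+
+/* Note: the MADD1..MADD4 / VMADD1..VMADD4 slots pick add vs. subtract for the
+   four partial products of a complex multiply-accumulate, so one kernel body
+   serves the plain and conjugated variants selected by the macros above. */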
+#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + vldrepl.w VALPHAR, $sp, 112 + vldrepl.w VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, 2 + + move J, $r0 + srai.d T0, N, 2 //bn/4 + beq J, T0, .L19 + +.L10: /* for(j=0; j0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vand.v D0, U2, U2 + vand.v D1, U3, U3 + vand.v D2, U2, U2 + vand.v D3, U3, U3 + + vpermi.w D0, U0, 0x44 + vpermi.w D2, U0, 0xee + vpermi.w D1, U1, 0x44 + vpermi.w D3, U1, 0xee + + vst D0, TD, 0x00 + vst D2, TD, 0x10 + vst D1, TD, 0x20 + vst D3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vand.v D0, U1, U1 + + vpermi.w D0, U0, 0x44 + vpermi.w U1, U0, 0xee + + vst D0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x10 // a_offset + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d TD, TD, 0x10 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + + addi.d S1, S1, 0x10 // aoffset1 + addi.d TD, TD, 0x10 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_ncopy_8_lsx.S b/kernel/loongarch64/cgemm_ncopy_8_lsx.S new file mode 100644 index 000000000..87a88e37d --- /dev/null +++ b/kernel/loongarch64/cgemm_ncopy_8_lsx.S @@ -0,0 +1,263 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 +#define D8 $vr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + slli.d T0, TL, 0x03 + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, TS, T0 + + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + fld.s F0, S5, 0x00 + fld.s F1, S5, 0x04 + fld.s F2, S6, 0x00 + fld.s F3, S6, 0x04 + fld.s F4, S7, 0x00 + fld.s F5, S7, 0x04 + fld.s F6, S8, 0x00 + fld.s F7, S8, 0x04 + + fst.s F0, TD, 0x20 + fst.s F1, TD, 0x24 + fst.s F2, TD, 0x28 + fst.s F3, TD, 0x2c + fst.s F4, TD, 0x30 + fst.s F5, TD, 0x34 + fst.s F6, TD, 0x38 + fst.s F7, TD, 0x3c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d TS, S4, TL + + beq I, ZERO, .L_N2 + +.L_N11: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + fld.s F4, 
S3, 0x00 + fld.s F5, S3, 0x04 + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N2: /* if(n&2)*/ + andi I, N, 0x02 + beq I, ZERO, .L_N3 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d TS, S2, TL + + beq I, ZERO, .L_N3 + +.L_N21: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N3: /* if(n&2)*/ + andi I, N, 0x01 + beq I, ZERO, .L_N0 + + move S1, TS + move I, M + + beq I, ZERO, .L_N0 + +.L_N31: /* if(i>0)*/ + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + + addi.d S1, S1, 0x08 + addi.d TD, TD, 0x08 + + addi.d I, I, -1 + blt ZERO, I, .L_N31 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_4_lsx.S b/kernel/loongarch64/cgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..6d63d62e7 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_4_lsx.S @@ -0,0 +1,324 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.w T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x02 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S9, S9, 0x40 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fld.s F4, S3, 0x00 + fld.s F5, S3, 0x04 + + fld.s F6, S4, 0x00 + fld.s F7, S4, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + fst.s F4, S10, 0x10 + fst.s F5, S10, 0x14 + fst.s F6, S10, 0x18 + fst.s F7, S10, 0x1c + + addi.d S10, S10, 0x20 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + slli.d T0, M, 0x05 + 
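+    // T0 = M * 32: step boffset1 to the next 4-column panel (M rows of four complex singles)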
add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S9, S9, 0x20 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + fst.s F2, S10, 0x08 + fst.s F3, S10, 0x0c + + addi.d S10, S10, 0x10 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + + addi.d S1, S1, 0x20 + slli.d T0, M, 0x05 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld U0, S1, 0x00 + + vst U0, S9, 0x00 + + addi.d S1, S1, 0x10 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, S10, 0x00 + fst.s F1, S10, 0x04 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/cgemm_tcopy_8_lsx.S b/kernel/loongarch64/cgemm_tcopy_8_lsx.S new file mode 100644 index 000000000..2935bbc07 --- /dev/null +++ b/kernel/loongarch64/cgemm_tcopy_8_lsx.S @@ -0,0 +1,277 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define D0 $vr8 +#define D1 $vr9 +#define D2 $vr10 +#define D3 $vr11 +#define D4 $vr12 +#define D5 $vr13 +#define D6 $vr14 +#define D7 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x02 //lda + slli.d TL, TL, 0x01 + + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x40 + + srai.d I, M, 0x01 + beq ZERO, I, .L_J1M1 + +.L_J1I1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + vst U4, TD, 0x40 + vst U5, TD, 0x50 + vst U6, TD, 0x60 + vst U7, TD, 0x70 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_J1I1 + +.L_J1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_J0 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d TD, TD, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4) */ + andi I, N, 0x04 + beq ZERO, I, .L_N2 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x20 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N1M1 + +.L_N1I1: /* if(i>0) i-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N1I1 + +.L_N1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N2: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N3 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x10 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N2M1 + +.L_N2I1: /* if(i>0) i-- */ + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + add.d S1, S1, T0 + add.d S2, S2, T0 + + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N2I1 + +.L_N2M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N3 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + + addi.d TD, TD, 0x10 + +.L_N3: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + + srai.d I, M, 0x01 + beq ZERO, I, .L_N3M1 + +.L_N3I1: /* if(i>0) i-- */ + fld.s F0, 
S1, 0x00 + fld.s F1, S1, 0x04 + + fld.s F2, S2, 0x00 + fld.s F3, S2, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0c + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N3I1 + +.L_N3M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N0 + + fld.s F0, S1, 0x00 + fld.s F1, S1, 0x04 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_kernel_4x4_lsx.S b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S new file mode 100644 index 000000000..6c4841b24 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S @@ -0,0 +1,2316 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + + +/* Function parameters */ +#define M $r4 // param 1: bm +#define N $r5 // param 2: bn +#define K $r6 // param 3: bk +#define ALPHA_R $f0 // param 4: alphar +#define ALPHA_I $f1 // param 5: alphai +#define A $r7 // param 6: ba +#define B $r8 // param 7: bb +#define C $r9 // param 8: bc +#define LDC $r10 // param 9: ldc + +#if defined (TRMMKERNEL) +#define OFFSET $r11 // param 10: offset +#endif +#define OFF $r26 + +#define I $r12 +#define J $r13 +#define L $r14 +#define TL $r15 +#define A0 $r16 +#define B0 $r17 +#define C0 $r18 +#define C1 $r19 +#define C2 $r20 +#define C3 $r23 +#define T0 $r24 +#define T1 $r25 +#define T2 $r26 +#define T3 $r27 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 +#define b1 $f10 +#define b2 $f11 +#define b3 $f12 +#define b4 $f13 +#define b5 $f14 +#define b6 $f15 +#define b7 $f16 +#define b8 $f17 +#define c11 $f18 +#define c12 $f19 +#define c21 $f20 +#define c22 $f21 +#define c31 $f22 +#define c32 $f23 +#define c41 $f24 +#define c42 $f25 + +/* LSX vectors */ +#define U0 $vr30 +#define U1 $vr31 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 +#define D0 $vr16 +#define D1 $vr17 +#define D2 $vr18 +#define D3 $vr19 +#define D4 $vr20 +#define D5 $vr21 +#define D6 $vr22 +#define D7 $vr23 +#define D8 $vr24 +#define D9 $vr25 +#define D10 $vr26 +#define D11 $vr27 +#define D12 $vr28 +#define D13 $vr29 +#define VALPHAR $vr28 +#define VALPHAI $vr29 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VNMSUB +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VMADD1 VFMADD +#define VMADD2 VFMADD +#define VMADD3 VFMADD +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VFMADD +#define VMADD4 VFMADD + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define VMADD1 VFMADD +#define VMADD2 VNMSUB +#define VMADD3 VNMSUB +#define VMADD4 VNMSUB + +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA_R,$sp, 112 + ST ALPHA_I,$sp, 120 + + vldrepl.d VALPHAR, $sp, 112 + vldrepl.d VALPHAI, $sp, 120 + +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, $r0, OFFSET +#else + xor OFF, OFF, OFF +#endif + + slli.d LDC, LDC, BASE_SHIFT + + move J, $r0 + srai.d T0, N, 2 //bn/4 + beq J, T0, .L19 + +.L10: /* for(j=0; j0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvand.v D0, U0, U0 + xvand.v 
D1, U1, U1 + xvand.v D2, U2, U2 + xvand.v D3, U3, U3 + + xvpermi.q D0, U2, 0x02 + xvpermi.q D2, U0, 0x31 + xvpermi.q D1, U3, 0x02 + xvpermi.q D3, U1, 0x31 + + xvst D0, TD, 0x00 + xvst D2, TD, 0x20 + xvst D1, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvand.v D0, U0, U0 + + xvpermi.q D0, U1, 0x02 + xvpermi.q U1, U0, 0x31 + + xvst D0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d S1, S1, 0x40 // aoffset1 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + xvld U0, S1, 0x00 + xvst U0, TD, 0x00 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + vld $vr0, S1, 0x00 + vst $vr0, TD, 0x00 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_4_lsx.S b/kernel/loongarch64/zgemm_ncopy_4_lsx.S new file mode 100644 index 000000000..203471cbd --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_4_lsx.S @@ -0,0 +1,332 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define TD $r20 +#define TS $r11 +#define TL $r19 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 + slli.d TL, TL, 0x01 + + srai.d J, N, 0x02 + beq J, ZERO, .L_N0 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vld U8, S3, 0x00 + vld U9, S3, 0x10 + vld U10, S3, 0x20 + vld U11, S3, 0x30 + + vld U12, S4, 0x00 + vld U13, S4, 0x10 + vld U14, S4, 0x20 + vld U15, S4, 0x30 + + vst U0, TD, 0x00 + vst U4, TD, 0x10 + vst U8, TD, 0x20 + vst U12, TD, 0x30 + + vst U1, TD, 0x40 + vst U5, TD, 0x50 + vst U9, TD, 0x60 + vst U13, TD, 0x70 + + vst U2, TD, 0x80 + vst U6, TD, 0x90 + vst U10, TD, 0xa0 + vst U14, TD, 0xb0 + + vst U3, TD, 0xc0 + vst U7, TD, 0xd0 + vst U11, TD, 0xe0 + vst U15, TD, 0xf0 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d TD, TD, 0x100 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_II20 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, TD, 0x00 + vst U2, TD, 0x10 + vst U4, TD, 0x20 + vst U6, TD, 0x30 + + vst U1, TD, 0x40 + vst U3, TD, 0x50 + vst U5, TD, 0x60 + vst U7, TD, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + +.L_II20: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_J0 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d TD, TD, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N0: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N20 + + move S1, TS + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + srai.d I, M, 0x02 + beq ZERO, I, .L_N10 + +.L_N11: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, TD, 0x00 + vst U4, TD, 0x10 + vst U1, TD, 0x20 + vst U5, TD, 0x30 + + vst U2, TD, 0x40 + vst U6, TD, 0x50 + vst U3, TD, 0x60 + vst U7, TD, 0x70 + + addi.d S1, S1, 0x40 // a_offset + addi.d S2, S2, 
0x40 + addi.d TD, TD, 0x80 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N10: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N130 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, TD, 0x00 + vst U2, TD, 0x10 + vst U1, TD, 0x20 + vst U3, TD, 0x30 + + addi.d S1, S1, 0x20 // a_offset + addi.d S2, S2, 0x20 + addi.d TD, TD, 0x40 // b_offset + +.L_N130: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N20 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d TD, TD, 0x20 + +.L_N20: /* if(n&1) */ + andi I, N, 0x01 + beq I, ZERO, .L_N00 + + move S1, TS + srai.d I, M, 0x02 + + beq I, ZERO, .L_N30 + +.L_N21: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + vst U2, TD, 0x20 + vst U3, TD, 0x30 + + addi.d S1, S1, 0x40 // aoffset1 + addi.d TD, TD, 0x40 // b_offset + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N30: /* if(m&2) */ + andi I, M, 0x02 + beq I, ZERO, .L_N330 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, TD, 0x00 + vst U1, TD, 0x10 + + addi.d S1, S1, 0x20 // aoffset1 + addi.d TD, TD, 0x20 // b_offset + +.L_N330: /* if(m&1) */ + andi I, M, 0x01 + beq I, ZERO, .L_N00 + + vld U0, S1, 0x00 + + vst U0, TD, 0x00 + +.L_N00: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_ncopy_8_lasx.S b/kernel/loongarch64/zgemm_ncopy_8_lasx.S new file mode 100644 index 000000000..7cd8f605b --- /dev/null +++ b/kernel/loongarch64/zgemm_ncopy_8_lasx.S @@ -0,0 +1,263 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 +#define D8 $xr16 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST //boffset + move TS, SRC //aoffset + + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + slli.d T0, TL, 0x03 + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, TS, T0 + + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + fld.d F4, S3, 0x00 + fld.d F5, S3, 0x08 + fld.d F6, S4, 0x00 + fld.d F7, S4, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + fld.d F0, S5, 0x00 + fld.d F1, S5, 0x08 + fld.d F2, S6, 0x00 + fld.d F3, S6, 0x08 + fld.d F4, S7, 0x00 + fld.d F5, S7, 0x08 + fld.d F6, S8, 0x00 + fld.d F7, S8, 0x08 + + fst.d F0, TD, 0x40 + fst.d F1, TD, 0x48 + fst.d F2, TD, 0x50 + fst.d F3, TD, 0x58 + fst.d F4, TD, 0x60 + fst.d F5, TD, 0x68 + fst.d F6, TD, 0x70 + fst.d F7, TD, 0x78 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d S3, S2, TL + add.d S4, S3, TL + add.d TS, S4, TL + + beq I, ZERO, .L_N2 + +.L_N11: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + fld.d F4, S3, 0x00 + fld.d F5, S3, 0x08 + fld.d F6, S4, 0x00 + fld.d F7, S4, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N11 + +.L_N2: /* if(n&2)*/ + andi I, N, 0x02 + beq I, ZERO, .L_N3 + + move S1, TS + add.d S2, TS, TL + move I, M + add.d TS, S2, TL + + beq I, ZERO, .L_N3 + +.L_N21: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + fld.d F2, S2, 0x00 + fld.d F3, S2, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + + addi.d S1, S1, 0x10 
+ addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N21 + +.L_N3: /* if(n&2)*/ + andi I, N, 0x01 + beq I, ZERO, .L_N0 + + move S1, TS + move I, M + + beq I, ZERO, .L_N0 + +.L_N31: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + + addi.d S1, S1, 0x10 + addi.d TD, TD, 0x10 + + addi.d I, I, -1 + blt ZERO, I, .L_N31 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_4_lasx.S b/kernel/loongarch64/zgemm_tcopy_4_lasx.S new file mode 100644 index 000000000..1adee11c5 --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_4_lasx.S @@ -0,0 +1,302 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
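
The two zgemm_tcopy_4 kernels added below (LASX here, LSX further down) split the destination buffer into three regions up front, so that the n & 2 and n & 1 column tails can be written independently of the main 4-column panels. The ori/andn/mul.d/slli.d prologue is easier to read as C; the sketch below is illustrative only, uses 2 doubles per complex element, and matches the layout of ../generic/zgemm_tcopy_4.c.

```c
/* Illustrative sketch of the destination-pointer setup performed by the
 * ori/andn/mul.d/slli.d prologue of the zgemm_tcopy_4 kernels. */
static void tcopy4_split_dst(long m, long n, double *dst,
                             double **boffset1, double **boffset2, double **boffset3)
{
    *boffset1 = dst;                      /* main 4-column panels             */
    *boffset2 = dst + 2 * m * (n & ~3L);  /* panels for the n & 2 column tail */
    *boffset3 = dst + 2 * m * (n & ~1L);  /* panel for the n & 1 column tail  */
}
```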
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x100 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + xvst U4, S8, 0x80 + xvst U5, S8, 0xa0 + xvst U6, S8, 0xc0 + xvst U7, S8, 0xe0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, S9, 0x00 + xvst U1, S9, 0x20 + xvst U2, S9, 0x40 + xvst U3, S9, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S9, S9, 0x80 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + vld $vr2, S3, 0x00 + vld $vr3, S4, 0x00 + + vst $vr0, S10, 0x00 + vst $vr1, S10, 0x10 + vst $vr2, S10, 0x20 + vst $vr3, S10, 0x30 + + addi.d S10, S10, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + xvst U2, S8, 0x40 + xvst U3, S8, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + xvld U0, S1, 0x00 + xvld U1, 
S2, 0x00 + + xvst U0, S9, 0x00 + xvst U1, S9, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S9, S9, 0x40 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, S10, 0x00 + vst $vr1, S10, 0x10 + + addi.d S10, S10, 0x20 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, S8, 0x00 + xvst U1, S8, 0x20 + + addi.d S1, S1, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + xvld U0, S1, 0x00 + + xvst U0, S9, 0x00 + + addi.d S1, S1, 0x20 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld $vr0, S1, 0x00 + + vst $vr0, S10, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_4_lsx.S b/kernel/loongarch64/zgemm_tcopy_4_lsx.S new file mode 100644 index 000000000..954753eaf --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_4_lsx.S @@ -0,0 +1,355 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
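
Beyond that pointer setup, the main panel loop of both tcopy_4 variants copies one 4x4 block of complex values at a time and spaces consecutive blocks of the same group 4*m complex elements apart in the destination. A scalar C rendering, illustrative only and using the same conventions as the sketches above:

```c
/* Illustrative sketch of the main panel copy in the zgemm_tcopy_4 kernels:
 * the m index strides by lda through a (counted in complex elements), the
 * n index is contiguous, and each 4x4 block of complex values is stored
 * whole, with consecutive blocks of one m-group 8*m doubles apart in b. */
static void tcopy4_main_panels(long m, long n, const double *a, long lda, double *b)
{
    for (long g = 0; g < (m >> 2); g++) {        /* group of four m indices */
        const double *a0 = a + 2 * lda * (4 * g + 0);
        const double *a1 = a + 2 * lda * (4 * g + 1);
        const double *a2 = a + 2 * lda * (4 * g + 2);
        const double *a3 = a + 2 * lda * (4 * g + 3);
        double *b1 = b + 32 * g;                 /* boffset1 for this group */
        for (long i = 0; i < (n >> 2); i++) {    /* four complex at a time  */
            for (long k = 0; k < 8; k++) b1[k]      = a0[8 * i + k];
            for (long k = 0; k < 8; k++) b1[8 + k]  = a1[8 * i + k];
            for (long k = 0; k < 8; k++) b1[16 + k] = a2[8 * i + k];
            for (long k = 0; k < 8; k++) b1[24 + k] = a3[8 * i + k];
            b1 += 8 * m;                         /* next block of this group */
        }
        /* The n & 2 and n & 1 tails of this group go to boffset2/boffset3,
         * see the pointer-setup sketch earlier. */
    }
    /* m & 2 and m & 1 groups follow the same pattern with 2 and 1 pointers. */
}
```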
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define TD $r16 +#define TS $r17 +#define TL $r18 +#define T0 $r19 +#define S8 $r20 +#define S9 $r23 +#define S10 $r11 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LSX vectors */ +#define U0 $vr0 +#define U1 $vr1 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define U10 $vr10 +#define U11 $vr11 +#define U12 $vr12 +#define U13 $vr13 +#define U14 $vr14 +#define U15 $vr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 //lda + + ori T0, ZERO, 0x03 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S9, DST, T0 //boffset2 + + ori T0, ZERO, 0x01 + andn T0, N, T0 + mul.d T0, M, T0 + slli.d T0, T0, 0x01 + slli.d T0, T0, 0x03 + add.d S10, DST, T0 //boffset3 + + srai.d J, M, 0x02 //j + + beq J, ZERO, .L_M1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + add.d S2, S1, TL + add.d S3, S2, TL + add.d S4, S3, TL + + slli.d T0, TL, 0x02 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x100 + + srai.d I, N, 0x02 + + beq ZERO, I, .L_JN1 + +.L_JI1: /* if(i>0) i--*/ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vld U8, S3, 0x00 + vld U9, S3, 0x10 + vld U10, S3, 0x20 + vld U11, S3, 0x30 + + vld U12, S4, 0x00 + vld U13, S4, 0x10 + vld U14, S4, 0x20 + vld U15, S4, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + vst U8, S8, 0x80 + vst U9, S8, 0x90 + vst U10, S8, 0xa0 + vst U11, S8, 0xb0 + vst U12, S8, 0xc0 + vst U13, S8, 0xd0 + vst U14, S8, 0xe0 + vst U15, S8, 0xf0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_JI1 + +.L_JN1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_JN2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vld U4, S3, 0x00 + vld U5, S3, 0x10 + + vld U6, S4, 0x00 + vld U7, S4, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + vst U4, S9, 0x40 + vst U5, S9, 0x50 + vst U6, S9, 0x60 + vst U7, S9, 0x70 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S9, S9, 0x80 + +.L_JN2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_J0 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + vld U2, S3, 0x00 + vld U3, S4, 0x00 + + vst U0, S10, 0x00 + vst U1, S10, 0x10 + vst U2, S10, 0x20 + vst U3, S10, 0x30 + + addi.d S10, S10, 0x40 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_M1: /* if(m&2) */ + andi I, M, 0x02 + beq ZERO, I, .L_M2 + + move S1, TS //aoffset1 + add.d S2, S1, TL + + slli.d T0, TL, 0x01 + add.d TS, TS, T0 + + move S8, TD //boffset1 + addi.d TD, TD, 0x80 + + srai.d I, 
N, 0x02 + beq ZERO, I, .L_M1N1 + +.L_M1I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vld U4, S2, 0x00 + vld U5, S2, 0x10 + vld U6, S2, 0x20 + vld U7, S2, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + vst U4, S8, 0x40 + vst U5, S8, 0x50 + vst U6, S8, 0x60 + vst U7, S8, 0x70 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M1I1 + +.L_M1N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M1N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vld U2, S2, 0x00 + vld U3, S2, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + vst U2, S9, 0x20 + vst U3, S9, 0x30 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S9, S9, 0x40 + +.L_M1N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M2 + + vld U0, S1, 0x00 + vld U1, S2, 0x00 + + vst U0, S10, 0x00 + vst U1, S10, 0x10 + + addi.d S10, S10, 0x20 + +.L_M2: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_M0 + + move S1, TS //aoffset1 + move S8, TD //boffset1 + + srai.d I, N, 0x02 + beq ZERO, I, .L_M2N1 + +.L_M2I1: /* if(i>0) */ + vld U0, S1, 0x00 + vld U1, S1, 0x10 + vld U2, S1, 0x20 + vld U3, S1, 0x30 + + vst U0, S8, 0x00 + vst U1, S8, 0x10 + vst U2, S8, 0x20 + vst U3, S8, 0x30 + + addi.d S1, S1, 0x40 + slli.d T0, M, 0x06 + add.d S8, S8, T0 + + addi.d I, I, -1 + blt ZERO, I, .L_M2I1 + +.L_M2N1: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_M2N2 + + vld U0, S1, 0x00 + vld U1, S1, 0x10 + + vst U0, S9, 0x00 + vst U1, S9, 0x10 + + addi.d S1, S1, 0x20 + +.L_M2N2: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + vld U0, S1, 0x00 + + vst U0, S10, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/loongarch64/zgemm_tcopy_8_lasx.S b/kernel/loongarch64/zgemm_tcopy_8_lasx.S new file mode 100644 index 000000000..f7440dc24 --- /dev/null +++ b/kernel/loongarch64/zgemm_tcopy_8_lasx.S @@ -0,0 +1,268 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 + + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TS, SRC //aoffset + move TD, DST //boffset + slli.d TL, LDA, 0x03 //lda + slli.d TL, TL, 0x01 + + srai.d J, N, 0x03 //j + + beq J, ZERO, .L_N1 + +.L_J1: /* if(j>0) j--*/ + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x80 + + srai.d I, M, 0x01 + beq ZERO, I, .L_J1M1 + +.L_J1I1: /* if(i>0) i--*/ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + xvst U4, TD, 0x80 + xvst U5, TD, 0xa0 + xvst U6, TD, 0xc0 + xvst U7, TD, 0xe0 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x100 + + addi.d I, I, -1 + blt ZERO, I, .L_J1I1 + +.L_J1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_J0 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + + addi.d TD, TD, 0x80 + +.L_J0: + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N1: /* if(n&4) */ + andi I, N, 0x04 + beq ZERO, I, .L_N2 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x40 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N1M1 + +.L_N1I1: /* if(i>0) i-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + xvst U2, TD, 0x40 + xvst U3, TD, 0x60 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_N1I1 + +.L_N1M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N2 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + addi.d TD, TD, 0x40 + +.L_N2: /* if(n&2) */ + andi I, N, 0x02 + beq ZERO, I, .L_N3 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + addi.d TS, TS, 0x20 + + srai.d I, M, 0x01 + beq ZERO, I, .L_N2M1 + +.L_N2I1: /* 
if(i>0) i-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, TD, 0x00 + xvst U1, TD, 0x20 + + add.d S1, S1, T0 + add.d S2, S2, T0 + + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_N2I1 + +.L_N2M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + + xvst U0, TD, 0x00 + + addi.d TD, TD, 0x20 + +.L_N3: /* if(n&1) */ + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + move S1, TS //aoffset1 + slli.d T0, TL, 0x01 //2*lda + add.d S2, TS, TL + + srai.d I, M, 0x01 + beq ZERO, I, .L_N3M1 + +.L_N3I1: /* if(i>0) i-- */ + vld $vr0, S1, 0x00 + vld $vr1, S2, 0x00 + + vst $vr0, TD, 0x00 + vst $vr1, TD, 0x10 + + add.d S1, S1, T0 + add.d S2, S2, T0 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_N3I1 + +.L_N3M1: /* if(m&1) */ + andi I, M, 0x01 + beq ZERO, I, .L_N0 + + vld $vr0, S1, 0x00 + + vst $vr0, TD, 0x00 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE \ No newline at end of file diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index f0fb5e087..c37b88adb 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -35,7 +35,7 @@ DSUMKERNEL = ../mips/sum.c CSUMKERNEL = ../mips/zsum.c ZSUMKERNEL = ../mips/zsum.c -ifdef HAVE_MSA +ifndef NO_MSA SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c CASUMKERNEL = ../mips/casum_msa.c @@ -47,7 +47,7 @@ CASUMKERNEL = ../mips/zasum.c ZASUMKERNEL = ../mips/zasum.c endif -ifdef HAVE_MSA +ifndef NO_MSA SAXPYKERNEL = ../mips/saxpy_msa.c DAXPYKERNEL = ../mips/daxpy_msa.c CAXPYKERNEL = ../mips/caxpy_msa.c @@ -59,7 +59,7 @@ CAXPYKERNEL = ../mips/zaxpy.c ZAXPYKERNEL = ../mips/zaxpy.c endif -ifdef HAVE_MSA +ifndef NO_MSA SCOPYKERNEL = ../mips/scopy_msa.c DCOPYKERNEL = ../mips/dcopy_msa.c CCOPYKERNEL = ../mips/ccopy_msa.c @@ -71,7 +71,7 @@ CCOPYKERNEL = ../mips/zcopy.c ZCOPYKERNEL = ../mips/zcopy.c endif -ifdef HAVE_MSA +ifndef NO_MSA SDOTKERNEL = ../mips/sdot_msa.c DDOTKERNEL = ../mips/ddot_msa.c CDOTKERNEL = ../mips/cdot_msa.c @@ -88,7 +88,7 @@ DNRM2KERNEL = ../mips/nrm2.c CNRM2KERNEL = ../mips/znrm2.c ZNRM2KERNEL = ../mips/znrm2.c -ifdef HAVE_MSA +ifndef NO_MSA SROTKERNEL = ../mips/srot_msa.c DROTKERNEL = ../mips/drot_msa.c CROTKERNEL = ../mips/crot_msa.c @@ -100,7 +100,7 @@ CROTKERNEL = ../mips/zrot.c ZROTKERNEL = ../mips/zrot.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSCALKERNEL = ../mips/sscal_msa.c DSCALKERNEL = ../mips/dscal_msa.c #CSCALKERNEL = ../mips/cscal_msa.c @@ -114,7 +114,7 @@ CSCALKERNEL = ../mips/zscal.c ZSCALKERNEL = ../mips/zscal.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSWAPKERNEL = ../mips/sswap_msa.c DSWAPKERNEL = ../mips/dswap_msa.c CSWAPKERNEL = ../mips/cswap_msa.c @@ -126,7 +126,7 @@ CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMVNKERNEL = ../mips/sgemv_n_msa.c DGEMVNKERNEL = ../mips/dgemv_n_msa.c CGEMVNKERNEL = ../mips/cgemv_n_msa.c @@ -138,7 +138,7 @@ CGEMVNKERNEL = ../mips/zgemv_n.c ZGEMVNKERNEL = ../mips/zgemv_n.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMVTKERNEL = ../mips/sgemv_t_msa.c DGEMVTKERNEL = ../mips/dgemv_t_msa.c CGEMVTKERNEL = ../mips/cgemv_t_msa.c @@ -150,7 +150,7 @@ CGEMVTKERNEL = ../mips/zgemv_t.c ZGEMVTKERNEL = ../mips/zgemv_t.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c @@ -164,7 +164,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA DGEMMKERNEL = 
../mips/dgemm_kernel_8x4_msa.c DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c @@ -182,7 +182,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c @@ -200,7 +200,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c @@ -214,7 +214,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o endif -ifdef HAVE_MSA +ifndef NO_MSA STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c @@ -226,7 +226,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c @@ -238,7 +238,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -250,7 +250,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4 index b81e5441d..1149d97f1 100644 --- a/kernel/mips64/KERNEL.LOONGSON3R4 +++ b/kernel/mips64/KERNEL.LOONGSON3R4 @@ -1,4 +1,4 @@ -ifdef HAVE_MSA +ifndef NO_MSA SAXPYKERNEL = ../mips/saxpy_msa.c DAXPYKERNEL = ../mips/daxpy_msa.c CAXPYKERNEL = ../mips/caxpy_msa.c @@ -8,14 +8,14 @@ SAXPYKERNEL = axpy_loongson3a.S DAXPYKERNEL = daxpy_loongson3a_simd.S endif -ifdef HAVE_MSA +ifndef NO_MSA SCOPYKERNEL = ../mips/scopy_msa.c DCOPYKERNEL = ../mips/dcopy_msa.c CCOPYKERNEL = ../mips/ccopy_msa.c ZCOPYKERNEL = ../mips/zcopy_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SDOTKERNEL = ../mips/sdot_msa.c DDOTKERNEL = ../mips/ddot_msa.c CDOTKERNEL = ../mips/cdot_msa.c @@ -23,21 +23,21 @@ ZDOTKERNEL = ../mips/zdot_msa.c endif DSDOTKERNEL = ../mips/dot.c -ifdef HAVE_MSA +ifndef NO_MSA SROTKERNEL = ../mips/srot_msa.c DROTKERNEL = ../mips/drot_msa.c CROTKERNEL = ../mips/crot_msa.c ZROTKERNEL = ../mips/zrot_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSCALKERNEL = ../mips/sscal_msa.c DSCALKERNEL = ../mips/dscal_msa.c CSCALKERNEL = ../mips/cscal_msa.c ZSCALKERNEL = ../mips/zscal_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMVNKERNEL = ../mips/sgemv_n_msa.c DGEMVNKERNEL = ../mips/dgemv_n_msa.c SGEMVTKERNEL = ../mips/sgemv_t_msa.c @@ -57,21 +57,21 @@ ZGEMVNKERNEL = zgemv_n_loongson3a.c ZGEMVTKERNEL = zgemv_t_loongson3a.c endif -ifdef HAVE_MSA +ifndef NO_MSA SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c CASUMKERNEL = ../mips/casum_msa.c ZASUMKERNEL = ../mips/zasum_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SSWAPKERNEL = ../mips/sswap_msa.c DSWAPKERNEL = ../mips/dswap_msa.c CSWAPKERNEL = ../mips/cswap_msa.c ZSWAPKERNEL = ../mips/zswap_msa.c endif -ifdef HAVE_MSA +ifndef NO_MSA SGEMMKERNEL = 
../mips/sgemm_kernel_8x8_msa.c SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c @@ -89,7 +89,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c @@ -107,7 +107,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c @@ -129,7 +129,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c @@ -143,7 +143,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -ifdef HAVE_MSA +ifndef NO_MSA STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c @@ -155,7 +155,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c @@ -167,7 +167,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -179,7 +179,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c endif -ifdef HAVE_MSA +ifndef NO_MSA ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 9047c714c..c84cd91d2 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -25,7 +25,7 @@ ZTRMMKERNEL = zgemm_kernel_power10.S endif SGEMMKERNEL = sgemm_kernel_power10.c -SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMINCOPY = sgemm_ncopy_16_power.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 2b8e65948..700a68e44 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -50,7 +50,7 @@ CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S SGEMMKERNEL = sgemm_kernel_16x8_power8.S -SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMINCOPY = sgemm_ncopy_16_power.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index b6b102b3e..7d007d1a2 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -13,7 +13,7 @@ CTRMMKERNEL = cgemm_kernel_power9.S ZTRMMKERNEL = zgemm_kernel_power9.S SGEMMKERNEL = sgemm_kernel_power9.S -SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMINCOPY = sgemm_ncopy_16_power.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = 
../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S diff --git a/kernel/power/sgemm_ncopy_16_power.c b/kernel/power/sgemm_ncopy_16_power.c new file mode 100755 index 000000000..babe1376e --- /dev/null +++ b/kernel/power/sgemm_ncopy_16_power.c @@ -0,0 +1,482 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp19 ; + IFLOAT ctemp21, ctemp23 ; + IFLOAT ctemp25, ctemp27 ; + IFLOAT ctemp29, ctemp31 ; + + aoffset = a; + boffset = b; + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset9 = aoffset8 + lda; + aoffset10 = aoffset9 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset += 16 * lda; + i = (m >> 2); + if (i > 0){ + vector float c1, c2, c3, c4, c5, c6, c7, c8; + vector float c9, c10, c11, c12, c13, c14, c15, c16; + vector float t1, t2, t3, t4, t5, t6, t7, t8; + vector float t9, t10, t11, t12; + do{ + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + c5 = vec_xl(0, aoffset5); + c6 = vec_xl(0, aoffset6); + c7 = vec_xl(0, aoffset7); + c8 = vec_xl(0, aoffset8); + c9 = vec_xl(0, aoffset9); + c10 = vec_xl(0, aoffset10); + c11 = vec_xl(0, aoffset11); + c12 = vec_xl(0, aoffset12); + c13 = vec_xl(0, aoffset13); + c14 = vec_xl(0, aoffset14); + c15 = vec_xl(0, aoffset15); + c16 = vec_xl(0, aoffset16); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + t3 = vec_mergeh(c5, c6); + t4 = vec_mergeh(c7, c8); + t9 = vec_mergeh(c9, c10); + t10 = vec_mergeh(c11, c12); + t11 = vec_mergeh(c13, c14); + t12 = vec_mergeh(c15, c16); + + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t9, t10, 0b00); + t8 = vec_xxpermdi(t11, t12, 0b00); + + vec_xst(t5, 0, boffset); + vec_xst(t6, 0, boffset+4); + vec_xst(t7, 0, boffset+8); + vec_xst(t8, 0, boffset+12); + t5 = vec_xxpermdi(t1, t2, 0b11); + t6 = vec_xxpermdi(t3, t4, 0b11); + t7 = vec_xxpermdi(t9, t10, 0b11); + t8 = vec_xxpermdi(t11, t12, 0b11); + vec_xst(t5, 0, boffset+16); + vec_xst(t6, 0, boffset+20); + vec_xst(t7, 0, boffset+24); + vec_xst(t8, 0, boffset+28); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + t3 = vec_mergel(c5, c6); + t4 = vec_mergel(c7, c8); + t9 = vec_mergel(c9, c10); + t10 = vec_mergel(c11, c12); + t11 = vec_mergel(c13, c14); + t12 = vec_mergel(c15, c16); + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t9, t10, 0b00); + t8 = vec_xxpermdi(t11, t12, 0b00); + vec_xst(t5, 0, boffset+32); + vec_xst(t6, 0, boffset+36); + vec_xst(t7, 0, boffset+40); + vec_xst(t8, 0, boffset+44); + + t5 = vec_xxpermdi(t1, t2, 0b11); + t6 = vec_xxpermdi(t3, t4, 0b11); + t7 = vec_xxpermdi(t9, t10, 0b11); + t8 = vec_xxpermdi(t11, t12, 0b11); + vec_xst(t5, 0, boffset+48); + vec_xst(t6, 0, boffset+52); + vec_xst(t7, 0, boffset+56); + vec_xst(t8, 0, boffset+60); + + aoffset1 += 4; + aoffset2 += 4; + 
aoffset3 += 4; + aoffset4 += 4; + aoffset5 += 4; + aoffset6 += 4; + aoffset7 += 4; + aoffset8 += 4; + + aoffset9 += 4; + aoffset10 += 4; + aoffset11 += 4; + aoffset12 += 4; + aoffset13 += 4; + aoffset14 += 4; + aoffset15 += 4; + aoffset16 += 4; + boffset += 64; + + i --; + }while(i > 0); + } + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + ctemp17 = *(aoffset9 + 0); + ctemp19 = *(aoffset10 + 0); + ctemp21 = *(aoffset11 + 0); + ctemp23 = *(aoffset12 + 0); + ctemp25 = *(aoffset13 + 0); + ctemp27 = *(aoffset14 + 0); + ctemp29 = *(aoffset15 + 0); + ctemp31 = *(aoffset16 + 0); + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + *(boffset + 8) = ctemp17; + *(boffset + 9) = ctemp19; + *(boffset + 10) = ctemp21; + *(boffset + 11) = ctemp23; + *(boffset + 12) = ctemp25; + *(boffset + 13) = ctemp27; + *(boffset + 14) = ctemp29; + *(boffset + 15) = ctemp31; + aoffset1+=1; + aoffset2+=1; + aoffset3+=1; + aoffset4+=1; + aoffset5+=1; + aoffset6+=1; + aoffset7+=1; + aoffset8+=1; + aoffset9+=1; + aoffset10+=1; + aoffset11+=1; + aoffset12+=1; + aoffset13+=1; + aoffset14+=1; + aoffset15+=1; + aoffset16+=1; + boffset += 16; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 2); + if (i > 0){ + vector float c1, c2, c3, c4, c5, c6, c7, c8; + vector float t1, t2, t3, t4, t5, t6, t7, t8; + do{ + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + c5 = vec_xl(0, aoffset5); + c6 = vec_xl(0, aoffset6); + c7 = vec_xl(0, aoffset7); + c8 = vec_xl(0, aoffset8); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + t3 = vec_mergeh(c5, c6); + t4 = vec_mergeh(c7, c8); + + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t1, t2, 0b11); + t8 = vec_xxpermdi(t3, t4, 0b11); + + vec_xst(t5, 0, boffset); + vec_xst(t6, 0, boffset+4); + vec_xst(t7, 0, boffset+8); + vec_xst(t8, 0, boffset+12); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + t3 = vec_mergel(c5, c6); + t4 = vec_mergel(c7, c8); + + t5 = vec_xxpermdi(t1, t2, 0b00); + t6 = vec_xxpermdi(t3, t4, 0b00); + t7 = vec_xxpermdi(t1, t2, 0b11); + t8 = vec_xxpermdi(t3, t4, 0b11); + + vec_xst(t5, 0, boffset+16); + vec_xst(t6, 0, boffset+20); + vec_xst(t7, 0, boffset+24); + vec_xst(t8, 0, boffset+28); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + aoffset5 += 4; + aoffset6 += 4; + aoffset7 += 4; + aoffset8 += 4; + + boffset += 32; + i--; + }while(i > 0); + } + + i = (m & 3); + if (i > 0) { + do { + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = 
ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + aoffset1+=1; + aoffset2+=1; + aoffset3+=1; + aoffset4+=1; + aoffset5+=1; + aoffset6+=1; + aoffset7+=1; + aoffset8+=1; + + boffset += 8; + i--; + } while (i > 0); + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + vector float c1, c2, c3, c4; + vector float t1, t2, t3, t4; + do{ + c1 = vec_xl(0, aoffset1); + c2 = vec_xl(0, aoffset2); + c3 = vec_xl(0, aoffset3); + c4 = vec_xl(0, aoffset4); + + t1 = vec_mergeh(c1, c2); + t2 = vec_mergeh(c3, c4); + + t3 = vec_xxpermdi(t1, t2, 0b00); + t4 = vec_xxpermdi(t1, t2, 0b11); + + vec_xst(t3, 0, boffset); + vec_xst(t4, 0, boffset+4); + + t1 = vec_mergel(c1, c2); + t2 = vec_mergel(c3, c4); + + t3 = vec_xxpermdi(t1, t2, 0b00); + t4 = vec_xxpermdi(t1, t2, 0b11); + + vec_xst(t3, 0, boffset+8); + vec_xst(t4, 0, boffset+12); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + + boffset += 16; + i--; + }while(i > 0); + } + + i = (m & 3); + if (i > 0) { + do { + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + + aoffset1+=1; + aoffset2+=1; + aoffset3+=1; + aoffset4+=1; + + boffset += 4; + i--; + } while (i > 0); + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + // boffset += 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index e6f2b3314..2798a870e 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -42,8 +42,8 @@ ZSUMKERNEL = ../arm/zsum.c SAXPYKERNEL = axpy_vector.c DAXPYKERNEL = axpy_vector.c -CAXPYKERNEL = zaxpy.c -ZAXPYKERNEL = zaxpy.c +CAXPYKERNEL = zaxpy_vector.c +ZAXPYKERNEL = zaxpy_vector.c SAXPBYKERNEL = axpby_vector.c DAXPBYKERNEL = axpby_vector.c @@ -59,7 +59,7 @@ SDOTKERNEL = dot_vector.c DDOTKERNEL = dot_vector.c CDOTKERNEL = zdot_vector.c ZDOTKERNEL = zdot_vector.c -DSDOTKERNEL = ../generic/dot.c +DSDOTKERNEL = dsdot_vector.c SNRM2KERNEL = nrm2_vector.c DNRM2KERNEL = nrm2_vector.c diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC index 61a8a2b91..15bcd2289 100644 --- a/kernel/riscv64/KERNEL.RISCV64_GENERIC +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -45,6 +45,11 @@ DAXPYKERNEL = ../riscv64/axpy.c CAXPYKERNEL = ../riscv64/zaxpy.c ZAXPYKERNEL = ../riscv64/zaxpy.c +SAXPBYKERNEL = ../riscv64/axpby.c +DAXPBYKERNEL = 
../riscv64/axpby.c +CAXPBYKERNEL = ../riscv64/zaxpby.c +ZAXPBYKERNEL = ../riscv64/zaxpby.c + SCOPYKERNEL = ../riscv64/copy.c DCOPYKERNEL = ../riscv64/copy.c CCOPYKERNEL = ../riscv64/zcopy.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B new file mode 100644 index 000000000..fec69ee09 --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -0,0 +1,243 @@ +SAMAXKERNEL = amax_rvv.c +DAMAXKERNEL = amax_rvv.c +CAMAXKERNEL = zamax_rvv.c +ZAMAXKERNEL = zamax_rvv.c + +SAMINKERNEL = amin_rvv.c +DAMINKERNEL = amin_rvv.c +CAMINKERNEL = zamin_rvv.c +ZAMINKERNEL = zamin_rvv.c + +SMAXKERNEL = max_rvv.c +DMAXKERNEL = max_rvv.c + +SMINKERNEL = min_rvv.c +DMINKERNEL = min_rvv.c + +ISAMAXKERNEL = iamax_rvv.c +IDAMAXKERNEL = iamax_rvv.c +ICAMAXKERNEL = izamax_rvv.c +IZAMAXKERNEL = izamax_rvv.c + +ISAMINKERNEL = iamin_rvv.c +IDAMINKERNEL = iamin_rvv.c +ICAMINKERNEL = izamin_rvv.c +IZAMINKERNEL = izamin_rvv.c + +ISMAXKERNEL = imax_rvv.c +IDMAXKERNEL = imax_rvv.c + +ISMINKERNEL = imin_rvv.c +IDMINKERNEL = imin_rvv.c + +SASUMKERNEL = asum_rvv.c +DASUMKERNEL = asum_rvv.c +CASUMKERNEL = zasum_rvv.c +ZASUMKERNEL = zasum_rvv.c + +SSUMKERNEL = sum_rvv.c +DSUMKERNEL = sum_rvv.c +CSUMKERNEL = zsum_rvv.c +ZSUMKERNEL = zsum_rvv.c + +SAXPYKERNEL = axpy_rvv.c +DAXPYKERNEL = axpy_rvv.c +CAXPYKERNEL = zaxpy_rvv.c +ZAXPYKERNEL = zaxpy_rvv.c + +SAXPBYKERNEL = axpby_rvv.c +DAXPBYKERNEL = axpby_rvv.c +CAXPBYKERNEL = zaxpby_rvv.c +ZAXPBYKERNEL = zaxpby_rvv.c + +SCOPYKERNEL = copy_rvv.c +DCOPYKERNEL = copy_rvv.c +CCOPYKERNEL = zcopy_rvv.c +ZCOPYKERNEL = zcopy_rvv.c + +SDOTKERNEL = dot_rvv.c +DDOTKERNEL = dot_rvv.c +CDOTKERNEL = zdot_rvv.c +ZDOTKERNEL = zdot_rvv.c +DSDOTKERNEL = dot_rvv.c + +SNRM2KERNEL = nrm2_rvv.c +DNRM2KERNEL = nrm2_rvv.c +CNRM2KERNEL = znrm2_rvv.c +ZNRM2KERNEL = znrm2_rvv.c + +SROTKERNEL = rot_rvv.c +DROTKERNEL = rot_rvv.c +CROTKERNEL = zrot_rvv.c +ZROTKERNEL = zrot_rvv.c + +SSCALKERNEL = scal_rvv.c +DSCALKERNEL = scal_rvv.c +CSCALKERNEL = zscal_rvv.c +ZSCALKERNEL = zscal_rvv.c + +SSWAPKERNEL = swap_rvv.c +DSWAPKERNEL = swap_rvv.c +CSWAPKERNEL = zswap_rvv.c +ZSWAPKERNEL = zswap_rvv.c + +SGEMVNKERNEL = gemv_n_rvv.c +DGEMVNKERNEL = gemv_n_rvv.c +CGEMVNKERNEL = zgemv_n_rvv.c +ZGEMVNKERNEL = zgemv_n_rvv.c + +SGEMVTKERNEL = gemv_t_rvv.c +DGEMVTKERNEL = gemv_t_rvv.c +CGEMVTKERNEL = zgemv_t_rvv.c +ZGEMVTKERNEL = zgemv_t_rvv.c + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +CGEMMKERNEL = 
cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c +STRMMUNCOPY_M = ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c +STRMMLNCOPY_M = ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c +STRMMUTCOPY_M = ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c +STRMMLTCOPY_M = ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c +DTRMMUNCOPY_M = ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c +DTRMMLNCOPY_M = ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c +DTRMMUTCOPY_M = ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c +DTRMMLTCOPY_M = ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c +CTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c +CTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +CTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c +CTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c +ZTRMMUNCOPY_M = ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c +ZTRMMLNCOPY_M = ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c +ZTRMMUTCOPY_M = ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c +ZTRMMLTCOPY_M = ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = symv_U_rvv.c +SSYMV_L_KERNEL = symv_L_rvv.c +DSYMV_U_KERNEL = symv_U_rvv.c +DSYMV_L_KERNEL = symv_L_rvv.c +CSYMV_U_KERNEL = zsymv_U_rvv.c +CSYMV_L_KERNEL = zsymv_L_rvv.c +ZSYMV_U_KERNEL = zsymv_U_rvv.c +ZSYMV_L_KERNEL = zsymv_L_rvv.c + +CHEMV_L_KERNEL = zhemv_LM_rvv.c +CHEMV_M_KERNEL = zhemv_LM_rvv.c +CHEMV_U_KERNEL = zhemv_UV_rvv.c +CHEMV_V_KERNEL = zhemv_UV_rvv.c +ZHEMV_L_KERNEL = zhemv_LM_rvv.c 
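
Most of the Level-1 and Level-2 entries in this list point at shared *_rvv.c sources rather than length-specific files, because those kernels are written in a vector-length-agnostic style and serve ZVL128B and wider implementations alike. As a rough illustration only (this code is not taken from the patch or from axpy_rvv.c, and the __riscv_-prefixed names assume a toolchain shipping the ratified RVV C intrinsics), a VLA daxpy-style loop looks like this:

```c
#include <riscv_vector.h>

/* Illustrative vector-length-agnostic y += alpha * x loop in the style of
 * the *_rvv.c kernels listed above.  vsetvl returns however many elements
 * the hardware grants per strip, so the same source runs on any VLEN >= 128. */
static void daxpy_vla_sketch(long n, double alpha, const double *x, double *y)
{
    while (n > 0) {
        size_t vl = __riscv_vsetvl_e64m8((size_t)n);     /* elements this strip */
        vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl);  /* load x chunk        */
        vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl);  /* load y chunk        */
        vy = __riscv_vfmacc_vf_f64m8(vy, alpha, vx, vl); /* y += alpha * x      */
        __riscv_vse64_v_f64m8(y, vy, vl);                /* store y chunk       */
        n -= (long)vl; x += vl; y += vl;
    }
}
```

Because the strip length is chosen at run time, the same binary loop is correct whether VLEN is 128, 256, or more bits.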
+ZHEMV_M_KERNEL = zhemv_LM_rvv.c +ZHEMV_U_KERNEL = zhemv_UV_rvv.c +ZHEMV_V_KERNEL = zhemv_UV_rvv.c + +SSYMMUCOPY_M = ../generic/symm_ucopy_$(SGEMM_UNROLL_M).c +SSYMMLCOPY_M = ../generic/symm_lcopy_$(SGEMM_UNROLL_M).c + +DSYMMUCOPY_M = ../generic/symm_ucopy_$(DGEMM_UNROLL_M).c +DSYMMLCOPY_M = ../generic/symm_lcopy_$(DGEMM_UNROLL_M).c + +CSYMMUCOPY_M = ../generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c +CSYMMLCOPY_M = ../generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c + +ZSYMMUCOPY_M = ../generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c +ZSYMMLCOPY_M = ../generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c + +CHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c +CHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c + +ZHEMMLTCOPY_M = ../generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c +ZHEMMUTCOPY_M = ../generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = gemm_beta_rvv.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = gemm_beta_rvv.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = zgemm_beta_rvv.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = zgemm_beta_rvv.c +endif diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B new file mode 100644 index 000000000..d8690682f --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -0,0 +1,199 @@ +SAMAXKERNEL = amax_vector.c +DAMAXKERNEL = amax_vector.c +CAMAXKERNEL = zamax_vector.c +ZAMAXKERNEL = zamax_vector.c + +SAMINKERNEL = amin_vector.c +DAMINKERNEL = amin_vector.c +CAMINKERNEL = zamin_vector.c +ZAMINKERNEL = zamin_vector.c + +SMAXKERNEL = max_vector.c +DMAXKERNEL = max_vector.c + +SMINKERNEL = min_vector.c +DMINKERNEL = min_vector.c + +ISAMAXKERNEL = iamax_vector.c +IDAMAXKERNEL = iamax_vector.c +ICAMAXKERNEL = izamax_vector.c +IZAMAXKERNEL = izamax_vector.c + +ISAMINKERNEL = iamin_vector.c +IDAMINKERNEL = iamin_vector.c +ICAMINKERNEL = izamin_vector.c +IZAMINKERNEL = izamin_vector.c + +ISMAXKERNEL = imax_vector.c +IDMAXKERNEL = imax_vector.c + +ISMINKERNEL = imin_vector.c +IDMINKERNEL = imin_vector.c + +SASUMKERNEL = asum_vector.c +DASUMKERNEL = asum_vector.c +CASUMKERNEL = zasum_vector.c +ZASUMKERNEL = zasum_vector.c + +SSUMKERNEL = sum_vector.c +DSUMKERNEL = sum_vector.c +CSUMKERNEL = zsum_vector.c +ZSUMKERNEL = zsum_vector.c + +SAXPYKERNEL = axpy_vector.c +DAXPYKERNEL = axpy_vector.c +CAXPYKERNEL = zaxpy_vector.c +ZAXPYKERNEL = zaxpy_vector.c + +SCOPYKERNEL = copy_vector.c +DCOPYKERNEL = copy_vector.c +CCOPYKERNEL = zcopy_vector.c +ZCOPYKERNEL = zcopy_vector.c + +SDOTKERNEL = dot_vector.c +DDOTKERNEL = dot_vector.c +CDOTKERNEL = zdot_vector.c +ZDOTKERNEL = zdot_vector.c +DSDOTKERNEL = ../generic/dot.c + +SNRM2KERNEL = nrm2_vector.c +DNRM2KERNEL = nrm2_vector.c +CNRM2KERNEL = znrm2_vector.c +ZNRM2KERNEL = znrm2_vector.c + +SROTKERNEL = rot_vector.c +DROTKERNEL = rot_vector.c +CROTKERNEL = zrot_vector.c +ZROTKERNEL = zrot_vector.c + +SSCALKERNEL = scal_vector.c +DSCALKERNEL = scal_vector.c +CSCALKERNEL = zscal_vector.c +ZSCALKERNEL = zscal_vector.c + +SSWAPKERNEL = swap_vector.c +DSWAPKERNEL = swap_vector.c +CSWAPKERNEL = zswap_vector.c +ZSWAPKERNEL = zswap_vector.c + +SGEMVNKERNEL = gemv_n_vector.c +DGEMVNKERNEL = gemv_n_vector.c +CGEMVNKERNEL = zgemv_n_vector.c +ZGEMVNKERNEL = zgemv_n_vector.c + +SGEMVTKERNEL = gemv_t_vector.c +DGEMVTKERNEL = gemv_t_vector.c +CGEMVTKERNEL = zgemv_t_vector.c +ZGEMVTKERNEL = zgemv_t_vector.c + +STRMMKERNEL = 
strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = symv_U_vector.c +SSYMV_L_KERNEL = symv_L_vector.c +DSYMV_U_KERNEL = symv_U_vector.c +DSYMV_L_KERNEL = symv_L_vector.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = 
../generic/zsymv_k.c + +CHEMV_L_KERNEL = zhemv_LM_vector.c +CHEMV_M_KERNEL = zhemv_LM_vector.c +CHEMV_U_KERNEL = zhemv_UV_vector.c +CHEMV_V_KERNEL = zhemv_UV_vector.c +ZHEMV_L_KERNEL = zhemv_LM_vector.c +ZHEMV_M_KERNEL = zhemv_LM_vector.c +ZHEMV_U_KERNEL = zhemv_UV_vector.c +ZHEMV_V_KERNEL = zhemv_UV_vector.c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 new file mode 100644 index 000000000..86708fe01 --- /dev/null +++ b/kernel/riscv64/KERNEL.x280 @@ -0,0 +1,281 @@ +# ********************************************************************************** +# Copyright (c) 2022, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ********************************************************************************** + +SAMAXKERNEL = amax_rvv.c +DAMAXKERNEL = amax_rvv.c +CAMAXKERNEL = zamax_rvv.c +ZAMAXKERNEL = zamax_rvv.c + +SAMINKERNEL = amin_rvv.c +DAMINKERNEL = amin_rvv.c +CAMINKERNEL = zamin_rvv.c +ZAMINKERNEL = zamin_rvv.c + +SMAXKERNEL = max_rvv.c +DMAXKERNEL = max_rvv.c + +SMINKERNEL = min_rvv.c +DMINKERNEL = min_rvv.c + +ISAMAXKERNEL = iamax_rvv.c +IDAMAXKERNEL = iamax_rvv.c +ICAMAXKERNEL = izamax_rvv.c +IZAMAXKERNEL = izamax_rvv.c + +ISAMINKERNEL = iamin_rvv.c +IDAMINKERNEL = iamin_rvv.c +ICAMINKERNEL = izamin_rvv.c +IZAMINKERNEL = izamin_rvv.c + +ISMAXKERNEL = imax_rvv.c +IDMAXKERNEL = imax_rvv.c + +ISMINKERNEL = imin_rvv.c +IDMINKERNEL = imin_rvv.c + +SASUMKERNEL = asum_rvv.c +DASUMKERNEL = asum_rvv.c +CASUMKERNEL = zasum_rvv.c +ZASUMKERNEL = zasum_rvv.c + +SSUMKERNEL = sum_rvv.c +DSUMKERNEL = sum_rvv.c +CSUMKERNEL = zsum_rvv.c +ZSUMKERNEL = zsum_rvv.c + +SAXPYKERNEL = axpy_rvv.c +DAXPYKERNEL = axpy_rvv.c +CAXPYKERNEL = zaxpy_rvv.c +ZAXPYKERNEL = zaxpy_rvv.c + +SAXPBYKERNEL = axpby_rvv.c +DAXPBYKERNEL = axpby_rvv.c +CAXPBYKERNEL = zaxpby_rvv.c +ZAXPBYKERNEL = zaxpby_rvv.c + +SCOPYKERNEL = copy_rvv.c +DCOPYKERNEL = copy_rvv.c +CCOPYKERNEL = zcopy_rvv.c +ZCOPYKERNEL = zcopy_rvv.c + +SDOTKERNEL = dot_rvv.c +DDOTKERNEL = dot_rvv.c +CDOTKERNEL = zdot_rvv.c +ZDOTKERNEL = zdot_rvv.c +DSDOTKERNEL = dot_rvv.c + +SNRM2KERNEL = nrm2_rvv.c +DNRM2KERNEL = nrm2_rvv.c +CNRM2KERNEL = znrm2_rvv.c +ZNRM2KERNEL = znrm2_rvv.c + +SROTKERNEL = rot_rvv.c +DROTKERNEL = rot_rvv.c +CROTKERNEL = zrot_rvv.c +ZROTKERNEL = zrot_rvv.c + +SSCALKERNEL = scal_rvv.c +DSCALKERNEL = scal_rvv.c +CSCALKERNEL = zscal_rvv.c +ZSCALKERNEL = zscal_rvv.c + +SSWAPKERNEL = swap_rvv.c +DSWAPKERNEL = swap_rvv.c +CSWAPKERNEL = zswap_rvv.c +ZSWAPKERNEL = zswap_rvv.c + +SGEMVNKERNEL = gemv_n_rvv.c +DGEMVNKERNEL = gemv_n_rvv.c +CGEMVNKERNEL = zgemv_n_rvv.c +ZGEMVNKERNEL = zgemv_n_rvv.c + +SGEMVTKERNEL = gemv_t_rvv.c +DGEMVTKERNEL = gemv_t_rvv.c +CGEMVTKERNEL = zgemv_t_rvv.c +ZGEMVTKERNEL = zgemv_t_rvv.c + +CTRMMKERNEL = ztrmmkernel_rvv_v1x4.c +ZTRMMKERNEL = ztrmmkernel_rvv_v1x4.c + +# SGEMM_UNROLL_N set in params.h +ifeq ($(SGEMM_UNROLL_N), 8) +# UNROLL_M is VLMAX +SGEMMKERNEL = gemmkernel_rvv_v1x8.c +SGEMMINCOPY = gemm_ncopy_rvv_v1.c +SGEMMITCOPY = gemm_tcopy_rvv_v1.c +SGEMMONCOPY = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMOTCOPY = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMKERNEL = trmmkernel_rvv_v1x8.c + +STRMMUNCOPY_M = trmm_uncopy_rvv_v1.c +STRMMLNCOPY_M = trmm_lncopy_rvv_v1.c +STRMMUTCOPY_M = trmm_utcopy_rvv_v1.c +STRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c + +SSYMMUCOPY_M = symm_ucopy_rvv_v1.c +SSYMMLCOPY_M = symm_lcopy_rvv_v1.c +endif + +# SGEMM_UNROLL_N set in params.h +ifeq ($(DGEMM_UNROLL_N), 8) +# UNROLL_M is VLMAX +DGEMMKERNEL = gemmkernel_rvv_v1x8.c +DGEMMINCOPY = gemm_ncopy_rvv_v1.c +DGEMMITCOPY = gemm_tcopy_rvv_v1.c +DGEMMONCOPY = gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMOTCOPY = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMKERNEL = trmmkernel_rvv_v1x8.c +DTRMMUNCOPY_M = trmm_uncopy_rvv_v1.c +DTRMMLNCOPY_M = trmm_lncopy_rvv_v1.c +DTRMMUTCOPY_M = 
trmm_utcopy_rvv_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_rvv_v1.c + +DSYMMUCOPY_M = symm_ucopy_rvv_v1.c +DSYMMLCOPY_M = symm_lcopy_rvv_v1.c +endif + +CGEMMKERNEL = zgemmkernel_rvv_v1x4.c +CGEMMINCOPY = zgemm_ncopy_rvv_v1.c +CGEMMITCOPY = zgemm_tcopy_rvv_v1.c +CGEMMONCOPY = zgemm_ncopy_4_rvv.c +CGEMMOTCOPY = zgemm_tcopy_4_rvv.c + +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemmkernel_rvv_v1x4.c + +ZGEMMINCOPY = zgemm_ncopy_rvv_v1.c +ZGEMMITCOPY = zgemm_tcopy_rvv_v1.c +ZGEMMONCOPY = zgemm_ncopy_4_rvv.c +ZGEMMOTCOPY = zgemm_tcopy_4_rvv.c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +STRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +STRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +DTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +CTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +CTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +CTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +CTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c +ZTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c +ZTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c +ZTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c + +TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c +TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c +TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c +TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_rvv_v1.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_rvv_v1.c +ZTRSMCOPYUN_M = ztrsm_uncopy_rvv_v1.c +ZTRSMCOPYUT_M = ztrsm_utcopy_rvv_v1.c + +SSYMV_U_KERNEL = symv_U_rvv.c +SSYMV_L_KERNEL = symv_L_rvv.c +DSYMV_U_KERNEL = symv_U_rvv.c +DSYMV_L_KERNEL = symv_L_rvv.c +CSYMV_U_KERNEL = zsymv_U_rvv.c +CSYMV_L_KERNEL = zsymv_L_rvv.c +ZSYMV_U_KERNEL = zsymv_U_rvv.c +ZSYMV_L_KERNEL = zsymv_L_rvv.c + +CHEMV_L_KERNEL = zhemv_LM_rvv.c +CHEMV_M_KERNEL = zhemv_LM_rvv.c +CHEMV_U_KERNEL = zhemv_UV_rvv.c +CHEMV_V_KERNEL = zhemv_UV_rvv.c +ZHEMV_L_KERNEL = zhemv_LM_rvv.c +ZHEMV_M_KERNEL = zhemv_LM_rvv.c +ZHEMV_U_KERNEL = zhemv_UV_rvv.c +ZHEMV_V_KERNEL = zhemv_UV_rvv.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c +ZHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c +CHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c + +ZSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c +ZSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c + +CSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c +CSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c + +ZTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c + +CTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = gemm_beta_rvv.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = gemm_beta_rvv.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = zgemm_beta_rvv.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = zgemm_beta_rvv.c +endif diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c new file mode 100644 index 000000000..451fbc834 --- /dev/null 
+++ b/kernel/riscv64/amax_rvv.c @@ -0,0 +1,102 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T vx, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { 
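+            // Non-unit stride path: VLSEV_FLOAT is a strided load gathering vl elements
+            // spaced stride_x bytes apart, so x advances by vl*inc_x per pass.
+            // The tail-undisturbed max (VFMAXVV_FLOAT_TU) leaves lanes beyond vl unchanged,
+            // keeping earlier partial maxima valid for the final reduction over vlmax lanes.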
+ vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index 1b7799340..b66d4871e 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -28,36 +28,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -65,103 +70,28 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=0.0; if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_zero; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_zero = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T v0, v1; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; - FLOAT zero = 0.0; if(inc_x == 1){ gvl = VSETVL(n); if(gvl <= n/2){ - v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = *((FLOAT*)&v_res); + v0 = VFABS_FLOAT(v0, gvl); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } }else{ @@ -169,94 +99,27 @@ asm volatile( BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ BLASLONG inc_xv = inc_x * gvl; - v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; 
i maxf) - maxf = *((FLOAT*)&v_res); + v0 = VFABS_FLOAT(v0, gvl); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } } + + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c new file mode 100644 index 000000000..5186d7b12 --- /dev/null +++ b/kernel/riscv64/amin_rvv.c @@ -0,0 +1,102 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T vx, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index f9b7defae..c4578eabf 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -26,232 +26,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" -#include -#include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define ABS fabs +# else +# define ELEN 32 +# define ABS fabsf +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define ABS fabs +# else +# define ELEN 32 +# define ABS fabsf +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - if (n <= 0 || inc_x <= 0) return(0.0); - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT minf=0.0; + if (n <= 0 || inc_x <= 0) return(minf); + + minf = ABS(*x); + x += inc_x; + --n; + if (n == 0) return(minf); + unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T v0, v1; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(minf, 1); - MASK_T mask0, mask1; - FLOAT zero = 0.0; if(inc_x == 1){ gvl = VSETVL(n); if(gvl <= n/2){ - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); + } + + } + + v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); + asumf = VFMVFS_FLOAT_M1(v_res); + return(asumf); +} diff --git 
a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c index fc73362bc..a652eafdd 100644 --- a/kernel/riscv64/asum_vector.c +++ b/kernel/riscv64/asum_vector.c @@ -28,111 +28,101 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFADDVV_FLOAT vfadd_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; - BLASLONG ix=0; FLOAT asumf=0.0; if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; - FLOAT_V_T v0, v1, v_zero,v_sum; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T v0, v1, v_sum; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; if(inc_x == 1){ gvl = VSETVL(n); - v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0; n -= vl, y += vl*inc_y) { + vl = VSETVL(n); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + } + + } else { + if ((1 == inc_x) && (1 == inc_y)) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSEV_FLOAT (y, vy, vl); + } + } else if (1 == inc_x) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VFMULVF_FLOAT(vx, 
alpha, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } else if (1 == inc_y) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VFMULVF_FLOAT(vx, alpha, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + } + + } else { + if ( alpha == 0.0 ) { + if (1 == inc_y) { + for (size_t vl; n > 0; n -= vl, y += vl) { + vl = VSETVL(n); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, y += vl*inc_y) { + vl = VSETVL(n); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + + } else { + if ((1 == inc_x) && (1 == inc_y)) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + } else if (1 == inc_x) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } else if (1 == inc_y) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMULVF_FLOAT(vy, beta, vl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, vl); + VSSEV_FLOAT (y, stride_y, vy, vl); + } + } + } + } + + return(0); +} diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c index 676dfd474..721aad2b0 100644 --- a/kernel/riscv64/axpby_vector.c +++ b/kernel/riscv64/axpby_vector.c @@ -27,31 +27,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 +# define LMUL m4 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) +#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMULVF_FLOAT JOIN(RISCV_RVV(vfmul), _vf_f, ELEN, LMUL, _) + int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) { - if (n < 0) return(0); + if (n <= 0) return(0); BLASLONG i=0, j=0; unsigned int gvl = 0; @@ -60,6 +69,63 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * BLASLONG stride_x, stride_y, ix = 0, iy = 0; + if (inc_x == 0 || inc_y == 0) { /* use trivial non-vectorized loop if either increment is zero */ + + if ( beta == 0.0 ) + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = 0.0 ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + + + } + + } + else + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = beta * y[iy] ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] + beta * y[iy] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + + } + + return(0); + + } else { /* vectorized approach for non-zero increments */ + if(beta == 0.0){ if(alpha == 0.0){//alpha == 0 && beta == 0 if(inc_y == 1){ @@ -372,5 +438,6 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT * } } return(0); + } } diff --git a/kernel/riscv64/axpy.c b/kernel/riscv64/axpy.c index fb1094dd9..19d12ad3f 100644 --- a/kernel/riscv64/axpy.c +++ b/kernel/riscv64/axpy.c @@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS BLASLONG i=0; BLASLONG ix,iy; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( da == 0.0 ) return(0); ix = 0; diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c new file mode 100644 index 000000000..8bc2f30de --- /dev/null +++ b/kernel/riscv64/axpy_rvv.c @@ -0,0 +1,109 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if ( n <= 0 ) return(0); + if ( da == 0.0 ) return(0); + + FLOAT_V_T vx, vy; + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSEV_FLOAT (y, vy, vl); + } + + } else if (1 == inc_y) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSEV_FLOAT(y, vy, vl); + } + + } else if (1 == inc_x) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + vy = VFMACCVF_FLOAT(vy, da, vx, vl); + 
VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c index 6f921f2d6..6dffe5f09 100644 --- a/kernel/riscv64/axpy_vector.c +++ b/kernel/riscv64/axpy_vector.c @@ -25,26 +25,38 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + #include "common.h" -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +# define LMUL m4 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) +#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0, j=0, jx=0, jy=0; @@ -53,7 +65,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS FLOAT_V_T vy0, vy1; BLASLONG stride_x, stride_y; - if (n < 0) return(0); + if (n <= 0) return(0); if (da == 0.0) return(0); if (inc_x == 1 && inc_y == 1) { diff --git a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..bd615389c --- /dev/null +++ b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c @@ -0,0 +1,996 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='float' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f32m2' + VFMUL='__riscv_vfmul_vf_f32m2' + VLEV='__riscv_vle32_v_f32m2' + VLSEV='__riscv_vlse32_v_f32m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m2' + VSETVL='__riscv_vsetvl_e32m2' + VSEV='__riscv_vse32_v_f32m2' + VSSEV='__riscv_vsse32_v_f32m2' + acc_vector_t='vfloat32m2_t' + output='cgemm_kernel_8x4_zvl128b.c' + param_scalar_t='float' + param_vector_t='vfloat32m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc 
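+// Sign scheme shared by all four branches below: the scalar tails accumulate
+//   real += S0*Ar*Br + S1*Ai*Bi   and   imag += S2*Ai*Br + S3*Ar*Bi,
+// while the vector path gets the same signs by pairing vfmul with
+// VFMACC_RR / VFMACC_RI (vfmacc / vfmsac / vfnmsac / vfnmacc).
+// This branch is the plain product: Cr = Ar*Br - Ai*Bi, Ci = Ar*Bi + Ai*Br;
+// the remaining branches flip signs for the conjugated input combinations.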
+#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e32m2(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = 
__riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + float B2r = B[bi + 2 * 2 + 0]; + float B2i = B[bi + 2 * 2 + 1]; + float B3r = B[bi + 3 * 2 + 0]; + float B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * 
gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat32m2_t ACC2r = tmp0r; + vfloat32m2_t ACC2i = tmp0i; + vfloat32m2_t ACC3r = tmp1r; + vfloat32m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C3i = 
__riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + float result8 = 0; + float result9 = 0; + float result10 = 0; + float result11 = 0; + float result12 = 0; + float result13 = 0; + float result14 = 0; + float result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr 
+= result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result8 * alphar; + Ci += result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; + Cr += result10 * alphar; + Ci += result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result12 * alphar; + Ci += result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; + Cr += result14 * alphar; + Ci += result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 
0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], 
sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + float B1r = B[bi + 1 * 2 + 0]; + float B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + vfloat32m2_t ACC1r = tmp1r; + vfloat32m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; 
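Both the vector epilogues above (paired vfmacc/vfnmsac on alphar/alphai) and the scalar M-tails around this point apply the same complex-alpha update to each accumulated element. A minimal scalar sketch of that update, using a hypothetical helper name, is:

static inline void complex_alpha_update(float *cr, float *ci,
                                        float acc_r, float acc_i,
                                        float alphar, float alphai)
{
    /* C += alpha * acc, with alpha = alphar + i*alphai; this is the same
       arithmetic as the Cr/Ci updates done with result0/result1 etc. */
    *cr += acc_r * alphar - acc_i * alphai;
    *ci += acc_i * alphar + acc_r * alphai;
}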
+ float result3 = 0; + float result4 = 0; + float result5 = 0; + float result6 = 0; + float result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e32m2(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], 
sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 8 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e32m2(4); + + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + float B0r = B[bi + 0 * 2 + 0]; + float B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat32m2_t ACC0r = tmp0r; + vfloat32m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse32_v_f32m2(&C[ci * 2 
+ 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + float result0 = 0; + float result1 = 0; + float result2 = 0; + float result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + float result0 = 0; + float result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + float Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c b/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c new file mode 100644 index 000000000..7980c029a --- /dev/null +++ b/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c @@ -0,0 +1,1931 @@ +/* + +AUTOGENERATED KERNEL +Settings: + LMUL=1 + M=8 + M_tail_scalar_from=1 + N=8 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl256b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='float' + reg_width_bits=256 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=32 + ELEN_PARAM=32 + LMUL_ACC=1 + VFMACC='__riscv_vfmacc_vf_f32m1' + VFMUL='__riscv_vfmul_vf_f32m1' + VLEV='__riscv_vle32_v_f32m1' + VLSEV='__riscv_vlse32_v_f32m1' + VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1' + VMUL_TO_ACC='__riscv_vfmul_vf_f32m1' + VSETVL='__riscv_vsetvl_e32m1' + VSEV='__riscv_vse32_v_f32m1' + VSSEV='__riscv_vsse32_v_f32m1' + acc_vector_t='vfloat32m1_t' + output='cgemm_kernel_8x8_zvl256b.c' + param_scalar_t='float' + param_vector_t='vfloat32m1_t' + +*/ + +#include "common.h" + + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define S0 1 + 
#define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + + // -- MAIN PASS + + for (BLASLONG j=0; j 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + VSEV_FLOAT(y, v0, vl); + } + + } else if (inc_y == 1) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + VSEV_FLOAT(y, v0, vl); + } + + } else if(inc_x == 1) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + VSSEV_FLOAT(y, stride_y, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + VSSEV_FLOAT(y, stride_y, v0, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c index fee5e195d..ccbd6e482 100644 --- a/kernel/riscv64/copy_vector.c +++ b/kernel/riscv64/copy_vector.c @@ -25,22 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# else +# define ELEN 32 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) + int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0, j=0; @@ -58,7 +71,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/4){ BLASLONG inc_xv = inc_x * gvl; - BLASLONG gvl3 = gvl * 3; + unsigned int gvl3 = gvl * 3; BLASLONG inc_xv3 = inc_xv * 3; for(i=0,j=0; i 0; n -= vl, x += vl, y += vl) { + vl = __riscv_vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); + 
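The S0..S3 constants and the VFMACC_RR / VFMACC_RI macro choices defined above for the cgemm kernels encode which operand is conjugated in each transpose-conjugate case. A scalar reference of one k-step, with hypothetical flag arguments, shows the signs they collapse to:

static void cmla_step(float ar, float ai, float br, float bi,
                      int conj_a, int conj_b,
                      float *acc_r, float *acc_i)
{
    /* acc += op(a) * op(b), where op() optionally conjugates.
       conj_a corresponds to RN/RT/CN/CT and RR/RC/CR/CC,
       conj_b corresponds to NR/NC/TR/TC and RR/RC/CR/CC. */
    if (conj_a) ai = -ai;
    if (conj_b) bi = -bi;
    *acc_r += ar * br - ai * bi;   /* S0 and S1 after folding the signs */
    *acc_i += ai * br + ar * bi;   /* S2 and S3 likewise */
}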
vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); + + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); +#else + vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); + vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); + + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); +#endif + } + + } else if (1 == inc_x) { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = __riscv_vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl); + vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); + + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); +#else + vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl); + vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); + + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); +#endif + } + } else if (1 == inc_y) { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = __riscv_vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl); + + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); +#else + vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl); + + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); +#endif + } + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = __riscv_vsetvl_e64m8(n); + +#if !defined(DOUBLE) + vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl); + vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl); + + vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl); +#else + vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl); + vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl); + + vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl); +#endif + } + } + + vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0, vlmax); + vfloat64m1_t vec_sum = __riscv_vfredusum_vs_f64m8_f64m1(vr, vec_zero, vlmax); + dot = __riscv_vfmv_f_s_f64m1_f64(vec_sum); + + return(dot); +} diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c index cc27d68ed..38ccc4778 100644 --- a/kernel/riscv64/dot_vector.c +++ b/kernel/riscv64/dot_vector.c @@ -27,31 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
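The dot kernel above widens single-precision products into a double-precision LMUL=8 accumulator with __riscv_vfwmacc_vv_f64m8_tu and performs a single reduction after the loop rather than reducing per block. A self-contained sketch of the unit-stride path, under a hypothetical wrapper name:

#include <riscv_vector.h>
#include <stddef.h>

static double sdot_widen(size_t n, const float *x, const float *y)
{
    size_t vlmax = __riscv_vsetvlmax_e64m8();
    vfloat64m8_t vr = __riscv_vfmv_v_f_f64m8(0.0, vlmax);

    for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
        vl = __riscv_vsetvl_e64m8(n);
        vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl);
        vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl);
        /* tail-undisturbed: lanes beyond vl keep their partial sums */
        vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl);
    }

    vfloat64m1_t zero = __riscv_vfmv_v_f_f64m1(0.0, vlmax);
    vfloat64m1_t sum  = __riscv_vfredusum_vs_f64m8_f64m1(vr, zero, vlmax);
    return __riscv_vfmv_f_s_f64m1_f64(sum);
}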
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT vfdot_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT vfdot_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) #endif #if defined(DSDOT) @@ -82,8 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) j += gvl; } if(j > 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -93,13 +99,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else if(inc_y == 1){ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_x = inc_x * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -119,14 +124,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else if(inc_x 
== 1){ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_y = inc_y * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -146,15 +149,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } }else{ gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); - int stride_x = inc_x * sizeof(FLOAT); - int stride_y = inc_y * sizeof(FLOAT); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } //tail if(j < n){ @@ -174,9 +175,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - dot += (double)VFMVFS_FLOAT(v_res); - + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + dot += (double)EXTRACT_FLOAT(v_res); } } return(dot); diff --git a/kernel/riscv64/dsdot_vector.c b/kernel/riscv64/dsdot_vector.c new file mode 100644 index 000000000..e972828b5 --- /dev/null +++ b/kernel/riscv64/dsdot_vector.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
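The new dsdot_vector.c beginning above follows the BLAS DSDOT convention: single-precision inputs, double-precision accumulation and return value. A scalar reference of what its widening loops compute (a sketch, not the kernel itself):

static double dsdot_ref(long n, const float *x, long inc_x,
                        const float *y, long inc_y)
{
    double dot = 0.0;
    /* every product is formed and summed in double precision */
    for (long i = 0; i < n; i++)
        dot += (double)x[i * inc_x] * (double)y[i * inc_y];
    return dot;
}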
+*****************************************************************************/ + +#include "common.h" + +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0, j=0; + double dot = 0.0 ; + + if ( n < 1 ) return(dot); + vfloat64m4_t vr; + vfloat32m2_t vx, vy; + unsigned int gvl = 0; + vfloat64m1_t v_res, v_z0; + gvl = vsetvlmax_e64m1(); + v_res = vfmv_v_f_f64m1(0, gvl); + v_z0 = vfmv_v_f_f64m1(0, gvl); + + if(inc_x == 1 && inc_y == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vle32_v_f32m2(&x[j], gvl); + vy = vle32_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + } + }else if(inc_y == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_x = inc_x * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = vle32_v_f32m2(&y[j], gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + }else if(inc_x == 1){ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vle32_v_f32m2(&x[j], gvl); + vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + }else{ + gvl = vsetvl_e64m4(n); + vr = vfmv_v_f_f64m4(0, gvl); + int stride_x = inc_x * sizeof(FLOAT); + int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + //tail + if(j < n){ + gvl = vsetvl_e64m4(n-j); + vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl); + vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl); + vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); + //vr = vfdot_vv_f32m2(vx, vy, gvl); + vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl); + v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl); + dot += (double)vfmv_f_s_f64m1_f64(v_res); + + } + } + return(dot); +} diff --git a/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c new file mode 100644 index 000000000..c1e0da86e --- /dev/null +++ b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c @@ -0,0 +1,660 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=4 + M=8 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=4 + 
VFMACC='__riscv_vfmacc_vf_f64m4' + VFMUL='__riscv_vfmul_vf_f64m4' + VLEV='__riscv_vle64_v_f64m4' + VLSEV='__riscv_vlse64_v_f64m4' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m4' + VSETVL='__riscv_vsetvl_e64m4' + VSEV='__riscv_vse64_v_f64m4' + VSSEV='__riscv_vsse64_v_f64m4' + acc_vector_t='vfloat64m4_t' + output='dtrmm_kernel_8x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m4_t' + +*/ + +#include "common.h" + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m4(8); + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 4; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); + vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 8; + } + + // -- tails for main pass + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + double B2 = B[bi + 2]; + double B3 = B[bi + 3]; + bi += 4; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl); + vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 
= B[bi + 1]; + B2 = B[bi + 2]; + B3 = B[bi + 3]; + bi += 4; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl); + result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl); + vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c2, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c3, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + result4 += A[ai + 0] * B[bi + 2]; + result5 += A[ai + 1] * B[bi + 2]; + result6 += A[ai + 0] * B[bi + 3]; + result7 += A[ai + 1] * B[bi + 3]; + ai += 2; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + C[ci + 2 * ldc + 0] = alpha * result4; + C[ci + 2 * ldc + 1] = alpha * result5; + C[ci + 3 * ldc + 0] = alpha * result6; + C[ci + 3 * ldc + 1] = alpha * result7; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 4; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + result2 += A[ai + 0] * B[bi + 2]; + result3 += A[ai + 0] * B[bi + 3]; + ai += 1; + bi += 4; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + C[ci + 2 * ldc + 0] = alpha * result2; + C[ci + 3 * ldc + 0] = alpha * result3; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 2; +#endif +#endif + double B0 = B[bi + 0]; + 
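Each block of the dtrmm kernels above derives an effective K range from `offset`, because one operand is triangular and only part of the panel contributes. A hypothetical helper, written purely to make the repeated #ifdef LEFT / BACKWARDS logic explicit:

static void trmm_range(long K, long offset, long m_top, long n_top,
                       long mr, long nr, int left, int backwards,
                       long *ai, long *bi, long *pass_K)
{
    long off = left ? (offset + m_top) : (-offset + n_top);
    *pass_K = K;
    if (backwards) {
        /* skip the leading `off` k-iterations of both packed panels */
        *ai += off * mr;
        *bi += off * nr;
        *pass_K -= off;
    } else {
        /* only the first off + mr (LEFT) or off + nr k-iterations contribute */
        *pass_K = off + (left ? mr : nr);
    }
}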
double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + double B0 = B[bi + 0]; + double B1 = B[bi + 1]; + bi += 2; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + B1 = B[bi + 1]; + bi += 2; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + ci += ldc - gvl * 0; + __riscv_vse64_v_f64m4(&C[ci], c1, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + result2 += A[ai + 0] * B[bi + 1]; + result3 += A[ai + 1] * B[bi + 1]; + ai += 2; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + C[ci + 1 * ldc + 0] = alpha * result2; + C[ci + 1 * ldc + 1] = alpha * result3; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 0] * B[bi + 1]; + ai += 1; + bi += 2; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 1 * ldc + 0] = alpha * result1; + m_top 
+= 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m4(8); + m_top = 0; + + for (BLASLONG i = 0; i < M / 8; i += 1) { + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 8; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 8; +#else + pass_K = off + 1; +#endif +#endif + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 8; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 8; + } + + if (M & 4) { + gvl = __riscv_vsetvl_e64m4(4); + + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + double B0 = B[bi + 0]; + bi += 1; + + vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl); + + for (BLASLONG k = 1; k < pass_K; k++) { + B0 = B[bi + 0]; + bi += 1; + + A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl); + ai += 4; + + result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl); + __riscv_vse64_v_f64m4(&C[ci], c0, gvl); + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + result1 += A[ai + 1] * B[bi + 0]; + ai += 2; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + C[ci + 0 * ldc + 1] = alpha * result1; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + BLASLONG ai = m_top * K; + BLASLONG bi = n_top * K; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1; + bi += off * 1; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += A[ai + 0] * B[bi + 0]; + ai += 1; + bi += 1; + } + + BLASLONG ci = n_top * ldc + m_top; + C[ci + 0 * ldc + 0] = alpha * result0; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c new file mode 100644 index 000000000..b1739f248 --- /dev/null +++ b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c @@ -0,0 +1,1068 @@ +/* + +AUTOGENERATED KERNEL +Settings: + LMUL=1 + M=8 + M_tail_scalar_from=2 + N=8 + 
__riscv_='__riscv_' + complex=False + conjugate=False + cpu='zvl256b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=256 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=1 + VFMACC='__riscv_vfmacc_vf_f64m1' + VFMUL='__riscv_vfmul_vf_f64m1' + VLEV='__riscv_vle64_v_f64m1' + VLSEV='__riscv_vlse64_v_f64m1' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m1' + VSETVL='__riscv_vsetvl_e64m1' + VSEV='__riscv_vse64_v_f64m1' + VSSEV='__riscv_vsse64_v_f64m1' + acc_vector_t='vfloat64m1_t' + output='dtrmm_kernel_8x8_zvl256b.c' + param_scalar_t='double' + param_vector_t='vfloat64m1_t' + +*/ + +#include "common.h" + + + +#if defined(LEFT) != defined(TRANSA) + #define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + + // -- MAIN PASS + + for (BLASLONG j=0; j 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { + vl = VSETVL(chunk); + + VSEV_FLOAT(c_offset, vx, vl); + } + } + + } else { + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) { + vl = VSETVL(chunk); + + vx = VLEV_FLOAT(c_offset, vl); + vx = VFMULVF_FLOAT(vx, beta, vl); + VSEV_FLOAT(c_offset, vx, vl); + } + } + + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c new file mode 100644 index 000000000..c652ab0c0 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_8_rvv.c @@ -0,0 +1,197 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
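The kernel just above this hunk appears to be the C-scaling (beta) path: one branch stores a prepared vector without reading C, the other loads, multiplies by beta and stores back. A scalar sketch of that behaviour, assuming the conventional beta == 0 special case (the exact branch condition is not visible here):

static void scale_c(long m, long n, double beta, double *c, long ldc)
{
    for (long j = 0; j < n; j++, c += ldc) {
        if (beta == 0.0) {
            /* store zeros without reading C, so stale NaN/Inf are not kept */
            for (long i = 0; i < m; i++) c[i] = 0.0;
        } else {
            for (long i = 0; i < m; i++) c[i] *= beta;
        }
    }
}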
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2 +#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4 +#define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8 +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2 +#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4 +#define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8 +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 +#endif + +// Optimizes the implementation in ../generic/gemm_ncopy_8.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; + FLOAT *b_offset; + + FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + + size_t vl; + + //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = (n >> 3); j > 0; j--) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; + a_offset8 = a_offset7 + lda; + a_offset += 8 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + v5 = VLEV_FLOAT(a_offset5, vl); + v6 = VLEV_FLOAT(a_offset6, vl); + v7 = VLEV_FLOAT(a_offset7, vl); + v8 = VLEV_FLOAT(a_offset8, vl); + + vx8 = VSET_VX8(vx8, 0, v1); + vx8 = VSET_VX8(vx8, 1, v2); + vx8 = VSET_VX8(vx8, 2, v3); + vx8 = VSET_VX8(vx8, 3, v4); + vx8 = VSET_VX8(vx8, 4, v5); + vx8 = VSET_VX8(vx8, 5, v6); + vx8 = VSET_VX8(vx8, 6, v7); + vx8 = VSET_VX8(vx8, 7, v8); + + VSSEG8_FLOAT(b_offset, vx8, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + a_offset5 += vl; + a_offset6 += vl; + a_offset7 += vl; + a_offset8 += vl; + b_offset += vl*8; + } + } + + if (n & 4) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + v3 = VLEV_FLOAT(a_offset3, vl); + v4 = VLEV_FLOAT(a_offset4, vl); + + vx4 = VSET_VX4(vx4, 0, v1); + vx4 = VSET_VX4(vx4, 1, v2); + vx4 = VSET_VX4(vx4, 2, v3); + vx4 = VSET_VX4(vx4, 3, v4); + + VSSEG4_FLOAT(b_offset, vx4, vl); + + a_offset1 += vl; + a_offset2 += vl; + a_offset3 += vl; + a_offset4 += vl; + b_offset += vl*4; + } + } + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + 
lda; + a_offset += 2 * lda; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + v2 = VLEV_FLOAT(a_offset2, vl); + + vx2 = VSET_VX2(vx2, 0, v1); + vx2 = VSET_VX2(vx2, 1, v2); + + VSSEG2_FLOAT(b_offset, vx2, vl); + + a_offset1 += vl; + a_offset2 += vl; + b_offset += vl*2; + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for(i = m; i > 0; i -= vl) { + vl = VSETVL(i); + + v1 = VLEV_FLOAT(a_offset1, vl); + + VSEV_FLOAT(b_offset, v1, vl); + + a_offset1 += vl; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_ncopy_rvv_v1.c b/kernel/riscv64/gemm_ncopy_rvv_v1.c new file mode 100644 index 000000000..2d6db15e5 --- /dev/null +++ b/kernel/riscv64/gemm_ncopy_rvv_v1.c @@ -0,0 +1,76 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
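`gemm_ncopy_rvv_v1.c`, whose body follows, packs a strip of `vl` columns per pass using one strided load per row, so the packed buffer holds `vl` consecutive column elements for each row in turn. A scalar sketch of that ordering, as an illustrative assumption only (`vl` stands in for the length returned by `VSETVL`, `double`/`long` for `FLOAT`/`BLASLONG`):

```c
/* Illustrative sketch of the packing order of gemm_ncopy_rvv_v1.c
 * (column-major A with leading dimension lda; not part of the patch). */
static void ncopy_v1_ref(long m, long n, const double *a, long lda,
                         double *b, long vl)
{
    for (long j = 0; j < n; j += vl)          /* block of vl columns      */
        for (long i = 0; i < m; i++)          /* walk down the rows       */
            for (long t = 0; t < vl && j + t < n; t++)
                *b++ = a[i + (j + t) * lda];  /* one element per column   */
}
```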
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) +{ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1; + FLOAT *b_offset; + + FLOAT_V_T v0; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + a_offset = a; + b_offset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + a_offset1 = a_offset; + a_offset += vl * lda; + + for(i = m; i > 0; i--) { + v0 = VLSEV_FLOAT(a_offset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(b_offset, v0, vl); + + a_offset1++; + b_offset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c new file mode 100644 index 000000000..4742ae6a7 --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -0,0 +1,273 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
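The `gemm_tcopy_8_rvv.c` kernel that follows relies on the segmented `VLSSEG8`/`VSSEG8` pair to move an 8x8 tile per step: eight runs of eight consecutive elements, `lda` apart in A, end up back to back in the packed buffer. A scalar sketch of one such tile (hypothetical helper, not part of the patch):

```c
/* Illustrative sketch of one 8x8 tile handled by the VLSSEG8/VSSEG8 pair:
 * eight runs of eight consecutive elements, lda apart in A, are packed
 * contiguously into B. Not part of the patch.                            */
static void tcopy_8x8_ref(const double *a, long lda, double *b)
{
    for (int s = 0; s < 8; s++)        /* 8 segments, stride lda apart  */
        for (int f = 0; f < 8; f++)    /* 8 consecutive fields each     */
            b[8 * s + f] = a[s * lda + f];
}
```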
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + FLOAT_V_T v0; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + + // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + for(j = (m >> 3); j > 0; j--) { + + aoffset1 = aoffset; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 64; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 8; + + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 8; + + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); + + aoffset1 += 4; + boffset2 += 32; + } + + if (n & 2) { + size_t vl = 8; + + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); + + aoffset1 += 2; + boffset3 += 16; + } + + if (n & 1) { + size_t vl = 8; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 8; + } + + } + + if (m & 4) { + + aoffset1 = aoffset; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 4; + + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 4; + + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); + + aoffset1 += 4; + boffset2 += 16; + } + + if (n & 2) { + size_t vl = 4; + + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); + + aoffset1 += 2; + boffset3 += 8; + } + + if (n & 1) { + size_t vl = 4; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 4; + } + } + + if (m & 2) { + aoffset1 = aoffset; + aoffset += 2 * lda; + + boffset1 = 
boffset; + boffset += 16; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 2; + + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG8_FLOAT(boffset1, vx8, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 4) { + size_t vl = 2; + + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG4_FLOAT(boffset2, vx4, vl); + + aoffset1 += 4; + boffset2 += 8; + } + + if (n & 2) { + size_t vl = 2; + + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSSEG2_FLOAT(boffset3, vx2, vl); + + aoffset1 += 2; + boffset3 += 4; + } + + if (n & 1) { + size_t vl = 2; + + v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); + VSEV_FLOAT(boffset4, v0, vl); + + aoffset1 += 1; + boffset4 += 2; + } + } + + if (m & 1) { + aoffset1 = aoffset; + boffset1 = boffset; + + for(i = (n >> 3); i > 0; i--) { + size_t vl = 8; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 8; + boffset1 += 8 * m; + } + + if (n & 4) { + size_t vl = 4; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v0, vl); + + aoffset1 += 4; + //boffset2 += 4; + } + + if (n & 2) { + size_t vl = 2; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset3, v0, vl); + + aoffset1 += 2; + // boffset3 += 2; + } + + if (n & 1) { + *(boffset4) = *(aoffset1); + // aoffset1 ++; + // boffset4 ++; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemm_tcopy_rvv_v1.c b/kernel/riscv64/gemm_tcopy_rvv_v1.c new file mode 100644 index 000000000..c5fb6479f --- /dev/null +++ b/kernel/riscv64/gemm_tcopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
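`gemm_tcopy_rvv_v1.c`, which follows, is the transposed-copy counterpart of the `_rvv_v1` ncopy: it moves `vl` consecutive elements per row with plain unit-stride loads, advancing by `lda` between rows. A scalar sketch of the resulting layout, with the same naming caveats as the earlier sketches:

```c
/* Illustrative sketch of the packing order of gemm_tcopy_rvv_v1.c
 * (vl stands in for the vector length chosen by VSETVL).           */
static void tcopy_v1_ref(long m, long n, const double *a, long lda,
                         double *b, long vl)
{
    for (long j = 0; j < n; j += vl)              /* strip of vl columns */
        for (long i = 0; i < m; i++)              /* one row at a time   */
            for (long t = 0; t < vl && j + t < n; t++)
                *b++ = a[i * lda + j + t];        /* contiguous run in A */
}
```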
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + IFLOAT *boffset; + + FLOAT_V_T v0; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + aoffset1 = aoffset; + aoffset += vl; + + for(i = m; i > 0; i--) { + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset, v0, vl); + + aoffset1 += lda; + boffset += vl; + } + } + + return 0; +} diff --git a/kernel/riscv64/gemmkernel_rvv_v1x8.c b/kernel/riscv64/gemmkernel_rvv_v1x8.c new file mode 100644 index 000000000..471b3158f --- /dev/null +++ b/kernel/riscv64/gemmkernel_rvv_v1x8.c @@ -0,0 +1,601 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
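The `gemmkernel_rvv_v1x8.c` kernel that follows computes a `vl x 8` tile of C per inner pass: one vector register of packed A is combined with eight scalars of packed B via `vfmacc`, and the accumulators are folded into C scaled by `alpha` at the end (the `#if 0` branch inside keeps the simple one-step-per-k variant for reference, the live path unrolls k by 8). A scalar statement of the same micro-tile update, offered only as a hypothetical reference:

```c
/* Illustrative scalar reference for one vl x 8 micro-tile of the RVV GEMM
 * kernel: ba/bb are the packed A/B panels, acc a caller-provided vl*8
 * scratch buffer. Not part of the patch.                                  */
static void gemm_tile_ref(long vl, long bk, double alpha,
                          const double *ba, const double *bb,
                          double *C, long ldc, double *acc)
{
    for (long x = 0; x < vl * 8; x++) acc[x] = 0.0;
    for (long k = 0; k < bk; k++)                  /* rank-1 update per k */
        for (int j = 0; j < 8; j++)
            for (long i = 0; i < vl; i++)
                acc[j * vl + i] += ba[k * vl + i] * bb[k * 8 + j];
    for (int j = 0; j < 8; j++)                    /* C += alpha * acc    */
        for (long i = 0; i < vl; i++)
            C[j * ldc + i] += alpha * acc[j * vl + i];
}
```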
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#endif + +int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7; + IFLOAT *ptrba,*ptrbb; + + //fprintf(stderr, "%s, bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", __FUNCTION__, bm, bn, bk, alpha, ldc); // Debug + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + size_t vl; + + // N:8 + for (j = bn/8; j > 0; j--) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + C4 = C3 + ldc; + C5 = C4 + ldc; + C6 = C5 + ldc; + C7 = C6 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrba += vl; + ptrbb += 8; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + ptrbb += 8; + + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); + ptrbb += 8; + + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, 
vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); + ptrbb += 8; + + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); + ptrbb += 8; + + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); + ptrbb += 8; + + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); + ptrbb += 8; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); + ptrbb += 8; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, 
alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VLEV_FLOAT(C2, vl); + va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VLEV_FLOAT(C3, vl); + va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); + VSEV_FLOAT(C3, va3, vl); + + va4 = VLEV_FLOAT(C4, vl); + va4 = VFMACCVF_FLOAT(va4, alpha, vres4, vl); + VSEV_FLOAT(C4, va4, vl); + + va5 = VLEV_FLOAT(C5, vl); + va5 = VFMACCVF_FLOAT(va5, alpha, vres5, vl); + VSEV_FLOAT(C5, va5, vl); + + va6 = VLEV_FLOAT(C6, vl); + va6 = VFMACCVF_FLOAT(va6, alpha, vres6, vl); + VSEV_FLOAT(C6, va6, vl); + + va7 = VLEV_FLOAT(C7, vl); + va7 = VFMACCVF_FLOAT(va7, alpha, vres7, vl); + VSEV_FLOAT(C7, va7, vl); + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + C4 += vl; + C5 += vl; + C6 += vl; + C7 += vl; + } + + bb += (bk<<3); + C += (ldc<<3); + } + + // N:4 + if (bn & 4) { + C0 = C; + C1 = C0 + ldc; + C2 = C1 + ldc; + C3 = C2 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrba += vl; + ptrbb += 4; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + ptrbb += 4; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + ptrbb += 4; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + ptrbb += 4; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + ptrbb += 4; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + ptrbb += 4; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + ptrbb += 4; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + ptrbb += 4; + + 
vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + ptrbb += 4; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VLEV_FLOAT(C2, vl); + va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VLEV_FLOAT(C3, vl); + va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl); + VSEV_FLOAT(C3, va3, vl); + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + } + + bb += (bk<<2); + C += (ldc<<2); + } + + // N:2 + if (bn & 2) { + C0 = C; + C1 = C0 + ldc; + ptrba = ba; + + for (i = bm; i > 0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrba += vl; + ptrbb += 2; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + ptrbb += 2; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + ptrbb += 2; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + ptrbb += 2; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + ptrbb += 2; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + ptrbb += 2; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VLEV_FLOAT(C1, vl); + va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl); + VSEV_FLOAT(C1, va1, vl); + + C0 += vl; + C1 += vl; + } + + bb += (bk<<1); + C += (ldc<<1); + } + + // N:1 + if (bn & 1) { + C0 = C; + ptrba = ba; + + for (i = bm; i > 
0; i -= vl) { + vl = VSETVL(i); + + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); +#if 0 + for (k = bk; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrba += vl; + ptrbb += 1; + } +#else + // Unroll K + for (k = bk/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + ptrbb += 1; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + ptrbb += 1; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + ptrbb += 1; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + ptrbb += 1; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + ptrbb += 1; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + ptrbb += 1; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + ptrbb += 1; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + ptrbb += 1; + } + + // K remainder + for (k = bk&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrbb += 1; + ptrba += vl; + } +#endif + va0 = VLEV_FLOAT(C0, vl); + va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl); + VSEV_FLOAT(C0, va0, vl); + + C0 += vl; + } + + bb += (bk); + C += (ldc); + } + + return 0; +} diff --git a/kernel/riscv64/gemv_n_rvv.c b/kernel/riscv64/gemv_n_rvv.c new file mode 100644 index 000000000..1366eb5ad --- /dev/null +++ b/kernel/riscv64/gemv_n_rvv.c @@ -0,0 +1,94 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
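`gemv_n_rvv.c`, which follows, vectorizes `y += alpha*A*x` along `m`: a strip of `y` stays resident in one vector register while the whole `n` loop of column updates runs, so each element of `y` is loaded and stored only once per strip. A scalar sketch of the computation (illustrative; `double`/`long` stand in for `FLOAT`/`BLASLONG`):

```c
/* Illustrative scalar sketch of gemv_n_rvv.c: A is m x n, column-major
 * with leading dimension lda. Not part of the patch.                   */
static void gemv_n_ref(long m, long n, double alpha,
                       const double *a, long lda,
                       const double *x, long inc_x,
                       double *y, long inc_y)
{
    for (long i = 0; i < m; i++) {
        double t = y[i * inc_y];
        for (long j = 0; j < n; j++)
            t += alpha * x[j * inc_x] * a[i + j * lda];
        y[i * inc_y] = t;
    }
}
```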
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + if(n < 0) return(0); + + FLOAT *a_ptr, *x_ptr; + BLASLONG i; + FLOAT_V_T va, vy; + + if(inc_y == 1) { + + for (size_t vl; m > 0; m -= vl, y += vl, a += vl) { + vl = VSETVL(m); + a_ptr = a; + x_ptr = x; + vy = VLEV_FLOAT(y, vl); + for(i = 0; i < n; i++) { + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); + + a_ptr += lda; + x_ptr += inc_x; + } + VSEV_FLOAT(y, vy, vl); + } + + } else { + + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; m > 0; m -= vl, y += vl*inc_y, a += vl) { + vl = VSETVL(m); + a_ptr = a; + x_ptr = x; + vy = VLSEV_FLOAT(y, stride_y, vl); + for(i = 0; i < n; i++) { + va = VLEV_FLOAT(a_ptr, vl); + vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl); + + a_ptr += lda; + x_ptr += inc_x; + } + VSSEV_FLOAT(y, stride_y, vy, vl); + } + + } + return(0); +} diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index bb9ab8e5a..aa13fc87d 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -27,21 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
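The `gemv_n_vector.c` hunk that follows only rewrites the intrinsic macros through a `RISCV_RVV()` wrapper, so the same kernel source can target both the older v0.10 intrinsics and the ratified v1.0 ones that carry the `__riscv_` prefix. The wrapper itself is defined in the patch's common headers rather than here; a plausible minimal form, shown purely as an assumption for orientation, would be:

```c
/* Hypothetical sketch of the RISCV_RVV() wrapper; the real definition
 * lives elsewhere in the patch and is not reproduced in this hunk.    */
#ifdef RISCV_0p10_INTRINSICS
#define RISCV_RVV(x) x              /* v0.10 toolchains: bare names     */
#else
#define RISCV_RVV(x) __riscv_##x    /* v1.0 intrinsics: __riscv_ prefix */
#endif
```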
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c new file mode 100644 index 000000000..9c859aa50 --- /dev/null +++ b/kernel/riscv64/gemv_t_rvv.c @@ -0,0 +1,118 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j; + FLOAT *a_ptr, *x_ptr; + + FLOAT_V_T va, vx, vr; + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax = VSETVL_MAX_M1; + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL_MAX; + + if(inc_x == 1) { + + for(i = 0; i < n; i++) { + j = m; + a_ptr = a; + x_ptr = x; + vr = VFMVVF_FLOAT(0, vlmax); + + for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) { + vl = VSETVL(j); + + va = VLEV_FLOAT(a_ptr, vl); + vx = VLEV_FLOAT(x_ptr, vl); + vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); + } + + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + *y += alpha * VFMVFS_FLOAT_M1(v_res); + y += inc_y; + a += lda; + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for(i = 0; i < n; i++) { + j = m; + a_ptr = a; + x_ptr = x; + vr = VFMVVF_FLOAT(0, vlmax); + + for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl*inc_x) { + vl = VSETVL(j); + + va = VLEV_FLOAT(a_ptr, vl); + vx = VLSEV_FLOAT(x_ptr, stride_x, vl); + vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl); + } + + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + *y += alpha * VFMVFS_FLOAT_M1(v_res); + y += inc_y; + a += lda; + } + + } + + return(0); +} diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c index 7d0b70cbb..62b85164c 100644 --- a/kernel/riscv64/gemv_t_vector.c +++ b/kernel/riscv64/gemv_t_vector.c @@ -27,107 +27,110 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t +#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) +#define FLOAT_V_T vfloat32m2_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT vfdot_vv_f32m4 -#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m2_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m2_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2) +#define xint_t int +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) +#define FLOAT_V_T vfloat64m2_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT vfdot_vv_f64m4 -#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m2_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m2_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2) +#define xint_t long long #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i = 0, j = 0, k = 0; - BLASLONG ix = 0, iy = 0; - FLOAT *a_ptr = a; + BLASLONG i = 0, j = 0, k = 0; + BLASLONG ix = 0, iy = 0; + FLOAT *a_ptr = a; FLOAT temp; FLOAT_V_T va, vr, vx; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + if(inc_x == 1){ for(i = 0; i < n; i++){ + v_res = VFMVVF_FLOAT_M1(0, 1); gvl = VSETVL(m); j = 0; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < m/gvl; k++){ va = VLEV_FLOAT(&a_ptr[j], gvl); vx = VLEV_FLOAT(&x[j], gvl); - vr = VFMACCVV_FLOAT(vr, va, vx, gvl); + vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail j += gvl; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp = (FLOAT)VFMVFS_FLOAT(v_res); if(j < m){ gvl = VSETVL(m-j); va = VLEV_FLOAT(&a_ptr[j], gvl); vx = VLEV_FLOAT(&x[j], gvl); vr = VFMULVV_FLOAT(va, vx, gvl); - - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp += (FLOAT)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); } + temp = (FLOAT)EXTRACT_FLOAT(v_res); 
y[iy] += alpha * temp; + + iy += inc_y; a_ptr += lda; } }else{ BLASLONG stride_x = inc_x * sizeof(FLOAT); - for(i = 0; i < n; i++){ + v_res = VFMVVF_FLOAT_M1(0, 1); gvl = VSETVL(m); - BLASLONG inc_xv = inc_x * gvl; j = 0; ix = 0; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < m/gvl; k++){ va = VLEV_FLOAT(&a_ptr[j], gvl); vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); - vr = VFMACCVV_FLOAT(vr, va, vx, gvl); + vr = VFMULVV_FLOAT(va, vx, gvl); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); j += gvl; - ix += inc_xv; + ix += inc_x * gvl; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp = (FLOAT)VFMVFS_FLOAT(v_res); if(j < m){ gvl = VSETVL(m-j); va = VLEV_FLOAT(&a_ptr[j], gvl); vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(va, vx, gvl); - - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp += (FLOAT)VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); } + temp = (FLOAT)EXTRACT_FLOAT(v_res); y[iy] += alpha * temp; + + iy += inc_y; a_ptr += lda; } } + + return(0); } - diff --git a/kernel/riscv64/generate_kernel.py b/kernel/riscv64/generate_kernel.py new file mode 100755 index 000000000..8be7c9f9c --- /dev/null +++ b/kernel/riscv64/generate_kernel.py @@ -0,0 +1,673 @@ +#!/usr/bin/python3 + +import sys, os +import contextlib + +#----------------------------------------------------------------------- +def ERROR(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + sys.exit(-1) + +class Target(object): + def __init__( self, out, mappings, initial_level=0, tab_width=4 ): + self._level = initial_level + self._tab_width = tab_width + self._out = out + self._mappings = mappings + + @contextlib.contextmanager + def map( self, **items ): + old_mappings = self._mappings + self._mappings = dict(old_mappings, **items) + yield self._mappings + self._mappings = old_mappings + + @contextlib.contextmanager + def block( self, start=None, end=None, **args ): + with self.map(**args): + if start is not None: + self.write(); + self.write(start) + self._level += 1 + yield self._level + self._level -= 1 + if end is not None: + self.write(end) + self.write() + + def write( self, fmt=None, *args, **kwargs ): + if fmt is not None: + mappings = dict(self._mappings, **kwargs) if kwargs else self._mappings + self._out(self._indent_str() + fmt.format(*args, **mappings)) + else: + self._out("") + + def _indent_str( self ): + return ' ' * (self._level * self._tab_width) + +#----------------------------------------------------------------------- +def generate_trmm_block( dest ): + dest.write("{index_type} pass_K = K;") + dest.write("#ifdef LEFT") + with dest.block(): + dest.write("{index_type} off = offset + m_top;") + dest.write("#else") + with dest.block(): + dest.write("{index_type} off = -offset + n_top;") + dest.write("#endif") + + dest.write("#ifdef BACKWARDS") + with dest.block(): + dest.write("ai += off*{M}{elt_size};") + dest.write("bi += off*{N}{elt_size};") + dest.write("pass_K -= off;") + dest.write("#else") + with dest.block(): + dest.write("#ifdef LEFT") + with dest.block(): + dest.write("pass_K = off + {M};") + dest.write("#else") + with dest.block(): + dest.write("pass_K = off + {N};") + dest.write("#endif") + dest.write("#endif") + +#----------------------------------------------------------------------- +def generate_gemm_kernel_inner_real( settings, dest, M, N, vlen, a_regs ): + TRMM = (settings['op'].value == 'trmm') + narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value + + with dest.map( + M=M, + N=N, + ): + dest.write("{index_type} 
ai=m_top*K{elt_size};") + dest.write("{index_type} bi=n_top*K{elt_size};") + if TRMM: + generate_trmm_block( dest ) + + for i in range(N): + dest.write("{param_scalar_t} B{i} = B[bi+{i}];", i=i) + dest.write("bi += {N};") + dest.write() + + for i in range(a_regs): + dest.write("{param_vector_t} A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i) + dest.write("ai += {M};") + dest.write() + + for j in range(N): + for i in range(a_regs): + dest.write("{acc_vector_t} result{dest} = {VMUL_TO_ACC}( A{i}, B{j}, gvl);", dest=j*a_regs+i, i=i, j=j) + + with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')): + for i in range(N): + dest.write("B{i} = B[bi+{i}];", i=i ) + dest.write("bi += {N};") + dest.write() + + for i in range(a_regs): + dest.write("A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i) + + dest.write("ai += {M};") + dest.write() + + + for j in range(N): + for i in range(a_regs): + dest.write("result{dest} = {VMACC_TO_ACC}( result{dest}, B{j}, A{i}, gvl);", dest= j*a_regs+i, j=j, i=i ) + + dest.write() + dest.write("{index_type} ci=n_top*ldc+m_top;") + dest.write() + + if narrow_result: + for j in range(N): + for i in range(a_regs): + dest.write("{param_vector_t} narrowed{idx} = {VFNCVT}( result{idx}, gvl );", idx=j*a_regs+i) + + if not TRMM: + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + dest.write("{param_vector_t} c{idx} = {VLEV}( &C[ci], gvl);{increment}", idx=idx, increment=increment) + + if narrow_result: + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + if TRMM: + dest.write("{param_vector_t} c{idx} = {VFMUL}( narrowed{idx}, alpha, gvl );", idx=idx) + else: + dest.write("c{idx} = {VFMACC}( c{idx}, alpha, narrowed{idx}, gvl );", idx=idx) + else: + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + if TRMM: + dest.write("{param_vector_t} c{idx} = {VFMUL}( result{idx}, alpha, gvl );", idx=idx) + else: + dest.write("c{idx} = {VFMACC}( c{idx}, alpha, result{idx}, gvl );", idx=idx) + + + if not TRMM: + dest.write() + dest.write("ci=n_top*ldc+m_top;") + dest.write() + + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + dest.write("{VSEV}( &C[ci], c{idx}, gvl);{increment}", idx=idx, increment=increment) + + +#----------------------------------------------------------------------- +def generate_gemm_kernel_inner_complex( settings, dest, M, N, vlen, a_regs ): + TRMM = (settings['op'].value == 'trmm') + narrow_result = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value + + if narrow_result: + raise RuntimeError("wide accumulator not supported for generated complex kernels") + # we could, but we run out of registers really really fast + + with dest.map( + M=M, + N=N, + ): + dest.write("{index_type} ai=m_top*K*2;") + dest.write("{index_type} bi=n_top*K*2;") + if TRMM: + generate_trmm_block( dest ) + + for i in range(N): + dest.write("{param_scalar_t} B{i}r = B[bi+{i}*2+0];", i=i) + dest.write("{param_scalar_t} B{i}i = B[bi+{i}*2+1];", i=i) + dest.write("bi += {N}*2;") + dest.write() + + for i in range(a_regs): + dest.write("{param_vector_t} A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i) + dest.write("{param_vector_t} A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", 
i=i) + dest.write("ai += {M}*2;") + dest.write() + + # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results + accumulation_regs = a_regs * N + dest.write("// {a_regs} vector regs to hold A array contents, {accumulation_regs} regs to hold values accumulated over k", + a_regs=a_regs*2, accumulation_regs=accumulation_regs*2 + ) + pass_regs = (accumulation_regs + a_regs)*2 + tmp_regs = (32 // settings['LMUL_ACC'].value) - pass_regs + if tmp_regs < 2: + raise RuntimeError("Complex kernel would use too many registers!") + + dest.write("// leaving {tmp_regs} vector registers for temporaries", tmp_regs=tmp_regs) + + tmp_unroll_i = min(tmp_regs, a_regs) + tmp_unroll_j = N + while tmp_unroll_j > 1 and (tmp_regs/(tmp_unroll_i*2)) < tmp_unroll_j: + tmp_unroll_j = int(tmp_unroll_j / 2) + + if tmp_unroll_i < a_regs or tmp_unroll_j < N: + dest.write("// performing {ops} operations between reuses of temporaries", ops=tmp_unroll_j*tmp_unroll_i) + + for tj in range(0, N, tmp_unroll_j): + for ti in range(0, a_regs, tmp_unroll_i): + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + if ti == 0 and tj==0: + dest.write("{acc_vector_t} tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("{acc_vector_t} tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + else: + dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") + dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") + + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("{acc_vector_t} ACC{dest}r = tmp{tmp}r;") + dest.write("{acc_vector_t} ACC{dest}i = tmp{tmp}i;") + + with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')): + for i in range(N): + dest.write("B{i}r = B[bi+{i}*2+0];", i=i) + dest.write("B{i}i = B[bi+{i}*2+1];", i=i) + dest.write("bi += {N}*2;") + dest.write() + + for i in range(a_regs): + dest.write("A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i) + dest.write("A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i) + + dest.write("ai += {M}*2;") + dest.write() + + + for tj in range(0, N, tmp_unroll_j): + for ti in range(0, a_regs, tmp_unroll_i): + # note the values in tmp{tmp}* are frequently of similar magnitude and opposite sign + # so accumulating them directly to ACC would lose precision when ACC is larger + + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);") + dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);") + dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);") + for j in range(tj, tj+tmp_unroll_j): + for i in range(ti, ti+tmp_unroll_i): + with 
dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j): + dest.write("ACC{dest}r = {__riscv_}vfadd( ACC{dest}r, tmp{tmp}r, gvl);") + dest.write("ACC{dest}i = {__riscv_}vfadd( ACC{dest}i, tmp{tmp}i, gvl);") + + dest.write() + dest.write("{index_type} ci=n_top*ldc+m_top;") + dest.write() + + for j in range(N): + if TRMM: + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("{param_vector_t} C{idx}r = {__riscv_}vfmul( ACC{idx}r, alphar, gvl );") + dest.write("{param_vector_t} C{idx}i = {__riscv_}vfmul( ACC{idx}i, alphar, gvl );") + else: + for i in range(a_regs): + idx = j*a_regs+i + increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + with dest.map(idx=j*a_regs+i, increment=increment): + dest.write("{param_vector_t} C{idx}r = {VLSEV}( &C[ci*2+0], sizeof(FLOAT)*2, gvl );") + dest.write("{param_vector_t} C{idx}i = {VLSEV}( &C[ci*2+1], sizeof(FLOAT)*2, gvl );") + dest.write("{increment}") + + if not TRMM: + for j in range(N): + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("C{idx}r = {__riscv_}vfmacc( C{idx}r, alphar, ACC{idx}r, gvl );") + dest.write("C{idx}i = {__riscv_}vfmacc( C{idx}i, alphar, ACC{idx}i, gvl );") + + for j in range(N): + for i in range(a_regs): + with dest.map(idx=j*a_regs+i): + dest.write("C{idx}r = {__riscv_}vfnmsac( C{idx}r, alphai, ACC{idx}i, gvl );") + dest.write("C{idx}i = {__riscv_}vfmacc ( C{idx}i, alphai, ACC{idx}r, gvl );") + + if not TRMM: + dest.write() + dest.write("ci=n_top*ldc+m_top;") + dest.write() + + for j in range(N): + for i in range(a_regs): + idx = j*a_regs+i + increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;' + if idx == N*a_regs-1: + increment = '' + with dest.map(idx=j*a_regs+i, increment=increment): + dest.write("{VSSEV}( &C[ci*2+0], sizeof(FLOAT)*2, C{idx}r, gvl);") + dest.write("{VSSEV}( &C[ci*2+1], sizeof(FLOAT)*2, C{idx}i, gvl);") + dest.write("{increment}") + +#----------------------------------------------------------------------- +def generate_gemm_kernel( settings, OUTPUT ): + if settings['conjugate'].value: + ERROR('conjugate gemm not yet supported') + + is_complex = settings['complex'].value + generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real + dest = Target(OUTPUT, { k:str(settings[k].value) for k in settings }) + + M = settings['M'].value + N = settings['N'].value + vlenmax = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value / + settings['ELEN_PARAM'].value) + a_regs = max(int(M/vlenmax), 1) + + # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results + accumulation_regs = a_regs * N + required_regs = accumulation_regs + a_regs + if is_complex: + required_regs = required_regs * 2 + 2 + dest.write(''' +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac{tail_policy} + #define VFMACC_RI __riscv_vfmacc{tail_policy} +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc{tail_policy} + #define VFMACC_RI __riscv_vfmsac{tail_policy} +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define S0 1 + #define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc{tail_policy} + #define 
VFMACC_RI __riscv_vfnmsac{tail_policy} +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac{tail_policy} + #define VFMACC_RI __riscv_vfnmacc{tail_policy} +#endif +'''.format(tail_policy=settings['tail_policy'].value)) + + + if required_regs > (32 // settings['LMUL_ACC'].value): + raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only {} are available".format( + required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else ''), 32 // settings['LMUL_ACC'].value + )) + + TRMM = (settings['op'].value == 'trmm') + if TRMM: + with dest.block("#if defined(LEFT) != defined(TRANSA)", "#endif"): + dest.write("#define BACKWARDS") + + dest.write("int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, {alpha}, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc{trmm})", + alpha = ('FLOAT alphar, FLOAT alphai' if is_complex else 'FLOAT alpha'), + trmm = (', BLASLONG offset' if TRMM else '') + ) + + with dest.block("{{", "}}", elt_size='*2' if is_complex else ''): + if settings['trace'].value: + dest.write("printf(\"\\n\\nENTRY: %s(%d) M %d N %d K %d ldc %d\\n\", __FILE__, __LINE__, M, N, K, ldc);") + dest.write("{index_type} gvl = 0;") + dest.write("{index_type} m_top = 0;") + dest.write("{index_type} n_top = 0;") + + dest.write() + dest.write() + dest.write("// -- MAIN PASS") + + with dest.block("for ({index_type} j=0; j 0 ): + with dest.map(N=N_tail): + dest.write() + dest.write() + dest.write("// -- tails for N={N}") + with dest.block("if( N & {N} ) {{", "}}" ): + if settings['trace'].value: + dest.write("printf(\"N tail entry: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + dest.write("gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1))) + dest.write("m_top = 0;") + with dest.block("for ({index_type} i=0; i M_tail_min ): + with dest.block("if( M & {M_tail} ) {{", "}}", M_tail=M_tail ): + if settings['trace'].value: + dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + a_regs = max( 1, int(M_tail/vlenmax) ) + vlen = int(M_tail/a_regs) + dest.write("gvl = {VSETVL}({vlen});\n", vlen=vlen) + + generate_gemm_kernel_inner( settings, dest, M_tail, N, vlen, a_regs ) + dest.write( "m_top += {M_tail};" ) + + M_tail = int( M_tail / 2 ) + + while( M_tail > 0 ): + with dest.block("if( M & {M_tail} ) {{", "}}", + M_tail=M_tail, + N=N, + result_t = ('double' if settings['force_acc_double'].value else settings['param_scalar_t'].value) + ): + if settings['trace'].value: + dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);") + for r in range(M_tail * N * (2 if is_complex else 1)): + dest.write("{result_t} result{r} = 0;", + r=r + ) + + dest.write("{index_type} ai=m_top*K{elt_size};") + dest.write("{index_type} bi=n_top*K{elt_size};") + + if TRMM: + with dest.map(M=M_tail, N=N): + generate_trmm_block( dest ) + + with dest.block("for({index_type} k=0; k<{Kend}; k++) {{", "}}", Kend = ('pass_K' if TRMM else 'K') ): + for ki in range( N ): + for kj in range( M_tail ): + if is_complex: + dest.write("result{dest}+=S0*A[ai+{kj}+0]*B[bi+{ki}+0] + S1*A[ai+{kj}+1]*B[bi+{ki}+1];".format( + dest=(ki*M_tail+kj)*2, kj=kj*2, ki=ki*2 + )) + dest.write("result{dest}+=S2*A[ai+{kj}+1]*B[bi+{ki}+0] + S3*A[ai+{kj}+0]*B[bi+{ki}+1];".format( + dest=(ki*M_tail+kj)*2+1, kj=kj*2, 
ki=ki*2 + )) + else: + dest.write("result{dest}+=A[ai+{kj}]*B[bi+{ki}];".format( + dest=ki*M_tail+kj, kj=kj, ki=ki + )) + dest.write("ai+={M_tail}{elt_size};") + dest.write("bi+={N}{elt_size};") + + dest.write("{index_type} ci=n_top*ldc+m_top;") + if is_complex: + dest.write("{result_t} Cr, Ci;") + for ki in range( N ): + for kj in range( M_tail ): + if is_complex: + if TRMM: + dest.write('Cr = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) + dest.write('Ci = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) + else: + dest.write('Cr = C[(ci+{ki}*ldc+{kj})*2+0];', ki=ki, kj=kj) + dest.write('Ci = C[(ci+{ki}*ldc+{kj})*2+1];', ki=ki, kj=kj) + dest.write('Cr += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0) + dest.write('Ci += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1) + dest.write('Cr -= result{dest}*alphai;', dest=(ki*M_tail+kj)*2+1) + dest.write('Ci += result{dest}*alphai;', dest=(ki*M_tail+kj)*2+0) + dest.write("C[(ci+{ki}*ldc+{kj})*2+0] = Cr;", ki=ki, kj=kj ) + dest.write("C[(ci+{ki}*ldc+{kj})*2+1] = Ci;", ki=ki, kj=kj ) + else: + op = '' if TRMM else '+' + dest.write("C[ci+{ki}*ldc+{kj}] {op}= alpha * result{dest};", + ki=ki, kj=kj, op=op, dest=ki*M_tail+kj + ) + dest.write("m_top+={M_tail};") + + M_tail = int(M_tail/2) + + +#----------------------------------------------------------------------- +class Setting(object): + def __init__( self, value, convert = None ): + self._value = value + self._convert = convert + + @classmethod + def ENUM( cls, *values ): + def closure( values ): + return lambda value: values[value.lower()] + return closure( { v.lower():v for v in values } ) + + @classmethod + def BOOL( cls, value ): + return value.lower().startswith('t') or value == '1' + + @property + def value( self ): + return self._value + + @property + def configurable( self ): + return self._convert is not None + + @value.setter + def value( self, value ): + self._value = self._convert( value ) + + def __str__( self ): + return str(self._value) + +#----------------------------------------------------------------------- +def main(): + settings = { + 'op': Setting( 'gemm', Setting.ENUM( 'gemm', 'trmm' ) ), + 'M': Setting( 16, int ), + 'N': Setting( 4, int ), + 'reg_width_bits': Setting( 256, int ), + 'LMUL': Setting( 1, int ), + 'M_tail_scalar_from':Setting( 2, int ), + 'cpu': Setting( 'zvl256b', str ), + 'param_precision': Setting( 'float', Setting.ENUM( 'float', 'double' ) ), + 'force_acc_double': Setting( False, Setting.BOOL ), + 'complex': Setting( False, Setting.BOOL ), + 'conjugate': Setting( False, Setting.BOOL ), + 'index_type': Setting( 'BLASLONG', str ), + 'trace': Setting( False, Setting.BOOL ), + 'output': Setting( None, str ), + 'tail_policy': Setting( '', str ), # _ta, if toolchain supports it + '__riscv_': Setting( '__riscv_', str), + } + + for item in sys.argv[1:]: + try: + name, value = tuple(item.split( '=', 1 )) + except: + ERROR("couldn't parse {}, expected arguments of the form name=value".format(item)) + + if name not in settings: + ERROR("couldn't parse {}, {} it is not a known option\n".format( item, name ) + +"options (and current defaults) are\n{}".format( + " ".join([ '{}={}'.format(k, settings[k].value) for k in settings.keys()])) + ) + + try: + settings[name].value = value + except: + import traceback + traceback.print_exc() + ERROR("couldn't parse {}".format(item)) + + if settings['output'].value is None: + if settings['complex'].value: + prefix = 'z' if settings['param_precision'].value == 'double' else 'c' + else: + prefix = 'd' if 
settings['param_precision'].value == 'double' else 's' + settings['output'] = Setting('{}{}_kernel_{}x{}_{}.c'.format( + prefix, + settings['op'], + settings['M'], + settings['N'], + settings['cpu'] + )) + + if settings['param_precision'].value == 'double': + settings['param_scalar_t'] = Setting( 'double' ) + settings['ELEN_PARAM'] = Setting(64) + else: + settings['param_scalar_t'] = Setting( 'float' ) + settings['ELEN_PARAM'] = Setting(32) + + settings['VFMUL'] = Setting( '{}vfmul_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) + settings['VFMACC'] = Setting( '{}vfmacc_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) ) + + settings['ELEN_ACC'] = settings['ELEN_PARAM'] + settings['LMUL_ACC'] = Setting(settings['LMUL'].value) + widen = '' + + if settings['force_acc_double'].value and (settings['param_precision'].value == 'float'): + settings['ELEN_ACC'] = Setting(64) + settings['LMUL_ACC'] = Setting(settings['LMUL'].value*2) + settings['VFNCVT'] = Setting('{}vfncvt_f_f_w_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy'])) + widen = 'w' + + settings['VMUL_TO_ACC'] = Setting( '{}vf{}mul_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) + settings['VMACC_TO_ACC'] = Setting( '{}vf{}macc_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) ) + + settings['param_vector_t']=Setting('vfloat{}m{}_t'.format(settings['ELEN_PARAM'], settings['LMUL'])) + settings['acc_vector_t'] =Setting('vfloat{}m{}_t'.format(settings['ELEN_ACC'], settings['LMUL_ACC'])) + settings['VLEV'] =Setting('{}vle{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSEV'] =Setting('{}vse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VLSEV'] =Setting('{}vlse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSSEV'] =Setting('{}vsse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL'])) + settings['VSETVL'] =Setting('{}vsetvl_e{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'])) + + + to_stdout = (settings['output'].value == '-') + if not to_stdout: + print("Writing {}".format(settings['output'].value), file=sys.stderr) + + with open(sys.stdout.fileno() if to_stdout else settings['output'].value, 'w') as destination_file: + def OUTPUT(*args, **kwargs): + print(*args, file=destination_file, **kwargs) + + OUTPUT("/*\n\nAUTOGENERATED KERNEL\nSettings:\n {}".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if settings[k].configurable]))) + OUTPUT("Derived:\n {}\n*/\n".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if not settings[k].configurable]))) + + OUTPUT('#include "common.h"') + OUTPUT("\n") + + if settings['op'].value in ('gemm', 'trmm'): + generate_gemm_kernel(settings, OUTPUT) + else: + ERROR("unsupported kernel type {}".format(settings['op'])) + +if __name__ == "__main__": + main() diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c new file mode 100644 index 000000000..8362d7cef --- /dev/null +++ 
b/kernel/riscv64/iamax_rvv.c @@ -0,0 +1,149 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu +#define 
VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-1, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c index 4242af6ea..800312400 100644 --- a/kernel/riscv64/iamax_vector.c +++ b/kernel/riscv64/iamax_vector.c @@ -27,127 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
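
The new iamax_rvv.c above implements the usual BLAS i?amax contract visible in its prologue and epilogue: return the 1-based index of the entry with the largest absolute value, walking x with stride inc_x, or 0 when n <= 0 or inc_x <= 0. A minimal plain-C sketch of that contract, for reference only (it is not part of the patch, and ties may be resolved differently by the vector code):

```c
/* Scalar sketch of the i?amax contract implemented by iamax_rvv.c above.
 * Reference only; not part of the patch. */
#include <math.h>

long iamax_ref(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0;

    long best = 0;
    double maxf = fabs(x[0]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[i * inc_x]);
        if (v > maxf) { maxf = v; best = i; }
    }
    return best + 1;            /* BLAS indices are 1-based */
}
```
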
#include "common.h" #include +#include #if defined(DOUBLE) -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t + +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 -#define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f64m4_f64m1(v_res, va, vb, gvl) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m4_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m4)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m4_f64m1) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m4_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m4) +#endif +#define MASK_T vbool16_t +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m4_b16) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m4) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m4_b16) +#define VMFIRSTM RISCV_RVV(vfirst_m_b16) +#define UINT_V_T vuint64m4_t +#define VIDV_UINT RISCV_RVV(vid_v_u64m4) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m4) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m4) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m4) +#define VMV_X RISCV_RVV(vmv_x_s_u64m4_u64) #else -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t + +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 -#define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f32m4_f32m1(v_res, va, vb, gvl) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m4_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m4)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m4_f32m1) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m4_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m4) +#endif 
+#define MASK_T vbool8_t +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m4_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m4) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m4_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) +#define UINT_V_T vuint32m4_t +#define VIDV_UINT RISCV_RVV(vid_v_u32m4) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m4) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m4) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m4) +#define VMV_X RISCV_RVV(vmv_x_s_u32m4_u32) #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT maxf=0.0; -#ifdef DOUBLE - BLASLONG max_index = 0; -#else + BLASLONG i=0, j=0; unsigned int max_index = 0; -#endif - if (n <= 0 || inc_x <= 0) return(max_index); + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; FLOAT_V_T vx, v_max; UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); + + gvl = VSETVL(n); + UINT_V_T vid = VIDV_UINT(gvl); if(inc_x == 1){ - gvl = VSETVL(n); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = VFMVFS_FLOAT(v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); -#ifdef DOUBLE - max_index = *((BLASLONG *)&v_max_index+max_index); -#else - max_index = *((unsigned int *)&v_max_index+max_index); -#endif + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); + if(j < n){ gvl = VSETVL(n-j); - vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_max = VLEV_FLOAT(&x[j], gvl); + v_max = VFABS_FLOAT(v_max, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = VFMVFS_FLOAT(v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index - v_max_index = VIDV_UINT(gvl); - v_max_index = VADDVX_UINT(v_max_index, j, gvl); + v_max_index = VADDVX_UINT(vid, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); -#ifdef DOUBLE - max_index = *((BLASLONG*)&v_max_index+max_index); -#else - max_index = *((unsigned int*)&v_max_index+max_index); -#endif + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } }else{ @@ -155,56 +151,48 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, 
gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = VFMVFS_FLOAT(v_res); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); -#ifdef DOUBLE - max_index = *((BLASLONG*)&v_max_index+max_index); -#else - max_index = *((unsigned int*)&v_max_index+max_index); -#endif + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); + if(j < n){ gvl = VSETVL(n-j); - vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); + v_max = VFABS_FLOAT(v_max, gvl); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = VFMVFS_FLOAT(v_res); if(cur_maxf > maxf){ //tail index - v_max_index = VIDV_UINT(gvl); - v_max_index = VADDVX_UINT(v_max_index, j, gvl); + v_max_index = VADDVX_UINT(vid, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); -#ifdef DOUBLE - max_index = *((BLASLONG*)&v_max_index+max_index); -#else - max_index = *((unsigned int*)&v_max_index+max_index); -#endif + + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } } - return(max_index+1); + return(max_index+1); } diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c new file mode 100644 index 000000000..f90dbb545 --- /dev/null +++ b/kernel/riscv64/iamin_rvv.c @@ -0,0 +1,150 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m8 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vx = VFABSV_FLOAT(vx, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = 
VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vx = VFABSV_FLOAT(vx, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index 4e81e7848..0e591e697 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -31,85 +31,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 -#define UINT_V_T vuint64m8_t +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f64m8_f64m1(v_res, va, vb, gvl) #define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 #define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu +#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) +#endif +#define MASK_T vbool8_t +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) +#define UINT_V_T vuint64m8_t +#define VIDV_UINT RISCV_RVV(vid_v_u64m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8) +#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT 
vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 -#define UINT_V_T vuint32m8_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f32m8_f32m1(v_res, va, vb, gvl) #define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 #define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu +#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) +#endif +#define MASK_T vbool4_t +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4) +#define VMFIRSTM RISCV_RVV(vfirst_m_b4) +#define UINT_V_T vuint32m8_t +#define VIDV_UINT RISCV_RVV(vid_v_u32m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8) +#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; FLOAT_V_T vx, v_min; UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); @@ -117,29 +125,29 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLEV_FLOAT(&x[j], gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_min = 
VLEV_FLOAT(&x[j], gvl); + v_min = VFABS_FLOAT(v_min, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } }else{ @@ -151,12 +159,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + vx = VFABS_FLOAT(vx, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); @@ -165,33 +171,31 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl; idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + v_min = VFABS_FLOAT(v_min, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } } - return(min_index+1); + return(min_index+1); } - - diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c new file mode 100644 index 000000000..b1a77b178 --- /dev/null +++ b/kernel/riscv64/imax_rvv.c @@ -0,0 +1,146 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
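
The index kernels in this patch (iamax/iamin, both the _rvv.c and the reworked _vector.c variants) share the same bookkeeping: each vector lane keeps a running extreme together with the global index where it was seen, built from vid.v plus the block offset j and merged only under the comparison mask, and a final horizontal reduction picks the winning lane. The following toy model illustrates that idea with a fixed four-lane width; the names are illustrative only and tie-breaking is not guaranteed to match the kernels:

```c
/* Toy model (illustrative names, fixed 4-lane width) of the per-lane index
 * bookkeeping used by the i?amax/i?amin kernels.  Not part of the patch. */
#include <math.h>
#include <stdio.h>
#include <stddef.h>

#define LANES 4

int main(void)
{
    const double x[] = {1.0, -7.0, 3.0, 2.0, 5.0, -1.0, 9.0, 4.0};
    const size_t n = sizeof x / sizeof x[0];

    double lane_max[LANES];
    size_t lane_idx[LANES] = {0};
    for (int l = 0; l < LANES; l++) lane_max[l] = -1.0;

    for (size_t j = 0; j < n; j += LANES)              /* one block per "vector" step */
        for (int l = 0; l < LANES && j + l < n; l++) {
            double v = fabs(x[j + l]);
            if (v > lane_max[l]) {                     /* the VMFLTVV comparison mask */
                lane_max[l] = v;                       /* vfmax, per-lane running max */
                lane_idx[l] = j + l;                   /* vid + j, under the mask     */
            }
        }

    double maxf = lane_max[0];                         /* vfredmax                    */
    for (int l = 1; l < LANES; l++)
        if (lane_max[l] > maxf) maxf = lane_max[l];

    size_t max_index = 0;                              /* mask >= maxf, first set lane */
    for (int l = 0; l < LANES; l++)
        if (lane_max[l] >= maxf) { max_index = lane_idx[l]; break; }

    printf("iamax = %zu\n", max_index + 1);            /* prints 7 for this input     */
    return 0;
}
```
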
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + + //index where element greater 
than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); + } + + } + + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index ca48a3c48..308fa15a4 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -31,68 +31,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f64m8_f64m1(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m8_f64m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) +#endif #define MASK_T vbool8_t -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m8_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m8) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m8_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_UINT RISCV_RVV(vid_v_u64m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) +#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define 
VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f32m8_f32m1(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m8_f32m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) +#endif #define MASK_T vbool4_t -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m8_b4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m8) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m8_b4) +#define VMFIRSTM RISCV_RVV(vfirst_m_b4) #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VIDV_UINT RISCV_RVV(vid_v_u32m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) +#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; + BLASLONG i=0, j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); - FLOAT maxf=-FLT_MAX; + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; FLOAT_V_T vx, v_max; UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_min; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -104,32 +116,34 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_max = VLEV_FLOAT(&x[j], gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + 
compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } }else{ @@ -145,37 +159,37 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); - v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); mask = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - FLOAT cur_maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask,gvl); - max_index = *((unsigned int*)&v_max_index+max_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } } - return(max_index+1); + return(max_index+1); } - - diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c new file mode 100644 index 000000000..1de7f3233 --- /dev/null +++ b/kernel/riscv64/imin_rvv.c @@ -0,0 +1,146 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
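
The reworked *_vector.c files spell every intrinsic through RISCV_RVV() and branch on RISCV_0p10_INTRINSICS for the few calls whose argument order changed (reductions, masked vid/vadd, vcompress). The macro itself is defined in the RISC-V common headers, outside this patch; judging from the usage here it behaves roughly like the sketch below, mapping one source spelling onto either the unprefixed v0.10 or the prefixed v1.0 intrinsic name:

```c
/* Sketch of the intrinsic-naming shim the *_vector.c kernels rely on.
 * The real definition lives outside this patch; this is an assumption
 * based on how the kernels use it. */
#ifdef RISCV_0p10_INTRINSICS
#  define RISCV_RVV(x) x                /* vsetvl_e32m8, vfredmax_vs_..., ... */
#else
#  define RISCV_RVV(x) __riscv_##x      /* __riscv_vsetvl_e32m8, ...          */
#endif

/* The kernels above then write, e.g.:
 *   #define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
 * and give the signature-changed intrinsics their own #ifdef branch. */
```
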
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m8_tumu +#define VIDV_UINT __riscv_vid_v_u64m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m8 +#define VMVVX_UINT __riscv_vmv_v_x_u64m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m8_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu +#define VFIRSTM __riscv_vfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m8_tumu +#define VIDV_UINT __riscv_vid_v_u32m8 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m8_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m8 +#define VMVVX_UINT __riscv_vmv_v_x_u32m8 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m8 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m8_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl, j += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl); + } + + } + 
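
A note on the _tu/_tumu intrinsic variants used in the loops above (vfmin_vv_*_tu, vid_v_*_tumu, vadd_vx_*_tumu): the tail-undisturbed and mask-undisturbed forms leave tail lanes, and lanes masked off, with their previous contents, so v_min and v_min_index carry state from earlier blocks even when the final block's vl is shorter than vlmax and the closing reduction runs over all vlmax lanes. A toy model of the tail-undisturbed part, with illustrative names only and not part of the patch:

```c
/* Toy model of "tail undisturbed": only the first vl lanes are recomputed,
 * the remaining lanes keep their previous per-lane minima, so a later
 * full-width reduction still sees the results of earlier blocks. */
#include <stdio.h>

#define VLMAX 4

static void min_step_tu(double lane_min[VLMAX], const double *x, int vl)
{
    for (int l = 0; l < vl; l++)          /* active lanes take the minimum  */
        if (x[l] < lane_min[l]) lane_min[l] = x[l];
    /* lanes vl..VLMAX-1 are deliberately left untouched */
}

int main(void)
{
    const double x[] = {4.0, 9.0, 2.0, 8.0, 7.0, 1.0};   /* n = 6, last vl = 2 */
    double lane_min[VLMAX] = {1e300, 1e300, 1e300, 1e300};

    min_step_tu(lane_min, x + 0, 4);      /* full block                     */
    min_step_tu(lane_min, x + 4, 2);      /* short final block              */

    double minf = lane_min[0];            /* reduction over all VLMAX lanes */
    for (int l = 1; l < VLMAX; l++)
        if (lane_min[l] < minf) minf = lane_min[l];

    printf("min = %g\n", minf);           /* 1; lanes 2 and 3 still hold the
                                             2 and 8 found in the first block */
    return 0;
}
```
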
+ FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index 2a677098d..ec36b8eb9 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -31,122 +31,119 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) -#define ABS fabs -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f64m8_f64m1(v_res, va, vb, gvl) +#define VIDV_MASK_UINT(mask, gvl) RISCV_RVV(vid_v_u64m8_m)(mask, v_min_index, gvl) +#define VADDVX_MASK_UINT(mask, a, b, gvl) RISCV_RVV(vadd_vx_u64m8_m)(mask, a, a, b, gvl) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f64m8_f64m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) +#endif #define MASK_T vbool8_t -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) #define UINT_V_T vuint64m8_t -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_UINT RISCV_RVV(vid_v_u64m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) +#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f32m8_f32m1(v_res, va, vb, gvl) +#define VIDV_MASK_UINT(mask, gvl) RISCV_RVV(vid_v_u32m8_m)(mask, v_min_index, gvl) +#define VADDVX_MASK_UINT(mask, a, b, gvl) RISCV_RVV(vadd_vx_u32m8_m)(mask, a, a, b, gvl) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m +#define VADDVX_MASK_UINT 
__riscv_vadd_vx_u32m8_m +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) +#endif #define MASK_T vbool4_t -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4) +#define VMFIRSTM RISCV_RVV(vfirst_m_b4) #define UINT_V_T vuint32m8_t -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VIDV_UINT RISCV_RVV(vid_v_u32m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) +#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; FLOAT_V_T vx, v_min; UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); - v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#endif -*/ - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); + + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_min = VLEV_FLOAT(&x[j], gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, 
mask, gvl); + min_index = VMV_X(compressed); } } }else{ @@ -159,59 +156,39 @@ asm volatile( for(i=0,j=0; i < n/gvl; i++){ vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //index where element less than v_min - mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask), "r"(gvl) - :"v0"); -#endif -*/ - - v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); mask = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); - - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = *((FLOAT*)&v_res); - if(cur_minf < minf){ + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); + if(cur_minf > minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask,gvl); - min_index = *((unsigned int*)&v_min_index+min_index); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } } - return(min_index+1); + return(min_index+1); } - - diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c new file mode 100644 index 000000000..32f66a7a7 --- /dev/null +++ b/kernel/riscv64/izamax_rvv.c @@ -0,0 +1,172 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m4_b16 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFIRSTM __riscv_vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu +#define VIDV_UINT __riscv_vid_v_u64m4 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m4 +#define VMVVX_UINT __riscv_vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 +#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m4_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu +#define VIDV_UINT __riscv_vid_v_u32m4 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m4 +#define VMVVX_UINT __riscv_vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx0, vx1, v_max; + FLOAT_VX2_T vxx2; + UINT_V_T 
v_max_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_max_index = VMVVX_UINT(0, vlmax); + v_max = VFMVVF_FLOAT(-1, vlmax); + BLASLONG j=0; + FLOAT maxf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx0, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl); + } + } + else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx0, vl); + v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); + v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl); + } + + } + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, vlmax); + max_index = VFIRSTM(mask, vlmax); + + v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); + max_index = VMVVXS_UINT(v_max_index); + + return(max_index+1); +} diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index 66a101566..d33e89c00 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -27,241 +27,146 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
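Both complex argmax kernels touched here, izamax_rvv.c above and the rewritten izamax_vector.c whose diff starts next, rank elements by the BLAS convention |Re| + |Im| rather than the Euclidean modulus: the segment loads (`VLSEG_FLOAT`/`VLSSEG_FLOAT`) split each complex element into real and imaginary vectors, both go through `vfabs` and are summed, and the result feeds the same masked index tracking as the real-valued kernels. A hedged scalar reference of those semantics, with illustrative names, assuming double precision:

```c
/*
 * Scalar reference (illustrative only) for the izamax semantics vectorized
 * above: BLAS ranks complex elements by |Re| + |Im| and returns a 1-based
 * index, with 0 for invalid arguments.
 */
#include <math.h>
#include <stddef.h>

static size_t izamax_ref(size_t n, const double *x, ptrdiff_t inc_x)
{
    if (n == 0 || inc_x <= 0) return 0;

    size_t best     = 0;
    double best_val = -1.0;                          /* matches the v_max = -1 seed */

    for (size_t i = 0; i < n; i++) {
        const double *e = x + 2 * (size_t)inc_x * i; /* interleaved re, im */
        double v = fabs(e[0]) + fabs(e[1]);
        if (v > best_val) {                          /* strictly greater: first hit wins */
            best_val = v;
            best     = i;
        }
    }
    return best + 1;
}
```

The non-unit-stride path uses a byte stride of `inc_x * sizeof(FLOAT) * 2` because each complex element occupies two FLOATs.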
#include "common.h" #include +#include #if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmax_vs_f64m8_f64m1)(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m8_f64m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) +#endif #define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m8_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m8) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m8_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) #define UINT_V_T vuint64m8_t -#define VSEVU_UINT vse64_v_u64m8 +#define VSEVU_UINT RISCV_RVV(vse64_v_u64m8) #define UINT_T long unsigned int -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_UINT RISCV_RVV(vid_v_u64m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8) +#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmax_vs_f32m8_f32m1)(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) +#else +#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m8_f32m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) +#endif #define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 
-#define VMFIRSTM vmfirst_m_b4 +#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m8_b4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m8) +#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m8_b4) +#define VMFIRSTM RISCV_RVV(vfirst_m_b4) #define UINT_V_T vuint32m8_t #define UINT_T unsigned int -#define VSEVU_UINT vse32_v_u32m8 -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VSEVU_UINT RISCV_RVV(vse32_v_u32m8) +#define VIDV_UINT RISCV_RVV(vid_v_u32m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8) +#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) #endif -#define RVV_M RVV_M8 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT maxf=0.0; + BLASLONG i=0, j=0; unsigned int max_index = 0; - if (n <= 0 || inc_x <= 0) return(max_index); + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; - FLOAT_V_T vx0, vx1, v_max; + FLOAT_V_T vx, vx2, v_max; UINT_V_T v_max_index; - MASK_T mask0, mask1; + MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); gvl = VSETVL(n); - UINT_T temp_uint[gvl]; + unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x * 2; + + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); v_max_index = VMVVX_UINT(0, gvl); - v_max = VFMVVF_FLOAT(-1, gvl); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = gvl * inc_x * 2; - BLASLONG ix = 0; for(i=0,j=0; i < n/gvl; i++){ - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + vx = VFABS_FLOAT(vx, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + vx = VFADDVV_FLOAT(vx, vx2, gvl); + //index where element greater than v_max - mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); - v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_max_index) - :"v"(mask0), "r"(gvl) - :"v0"); 
-#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_max_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#endif -*/ - v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j - v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; - ix += inc_xv; + idx += inc_v; } - vx0 = VFMVVF_FLOAT(0, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - maxf = VFMVFS_FLOAT(v_res); - mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); - max_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_max_index,gvl); - max_index = temp_uint[max_index]; - + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + maxf = EXTRACT_FLOAT(v_res); + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - v_max_index = VMVVX_UINT(0, gvl); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - v_max = VFADDVV_FLOAT(vx0, vx1, gvl); - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); - FLOAT cur_maxf = VFMVFS_FLOAT(v_res); + v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + v_max = VFABS_FLOAT(v_max, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + v_max = VFADDVV_FLOAT(v_max, vx2, gvl); + + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); + FLOAT cur_maxf = EXTRACT_FLOAT(v_res); + if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); v_max_index = VADDVX_UINT(v_max_index, j, gvl); - mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); - max_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_max_index,gvl); - max_index = temp_uint[max_index]; - + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_max_index, mask, gvl); + max_index = VMV_X(compressed); } } - return(max_index+1); -} - + return(max_index+1); +} diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c new file mode 100644 index 000000000..d34b220fa --- /dev/null +++ b/kernel/riscv64/izamin_rvv.c @@ -0,0 +1,171 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 +#define MASK_T vbool16_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m4_b16 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFIRSTM __riscv_vfirst_m_b16 +#define UINT_V_T vuint64m4_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u64m4_tumu +#define VIDV_UINT __riscv_vid_v_u64m4 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u64m4_tumu +#define VADDVX_UINT __riscv_vadd_vx_u64m4 +#define VMVVX_UINT __riscv_vmv_v_x_u64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u64m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u64m4_u64 +#else +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8 +#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8 +#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m4_b8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 
+#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFIRSTM __riscv_vfirst_m_b8 +#define UINT_V_T vuint32m4_t +#define VIDV_MASK_UINT_TU __riscv_vid_v_u32m4_tumu +#define VIDV_UINT __riscv_vid_v_u32m4 +#define VADDVX_MASK_UINT_TU __riscv_vadd_vx_u32m4_tumu +#define VADDVX_UINT __riscv_vadd_vx_u32m4 +#define VMVVX_UINT __riscv_vmv_v_x_u32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VSLIDEDOWN_UINT __riscv_vslidedown_vx_u32m4 +#define VMVVXS_UINT __riscv_vmv_x_s_u32m4_u32 +#endif + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx0, vx1, v_min; + FLOAT_VX2_T vxx2; + UINT_V_T v_min_index; + MASK_T mask; + + size_t vlmax = VSETVL_MAX; + v_min_index = VMVVX_UINT(0, vlmax); + v_min = VFMVVF_FLOAT(FLT_MAX, vlmax); + BLASLONG j=0; + FLOAT minf=0.0; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx0, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + + vx0 = VFABSV_FLOAT(vx0, vl); + vx1 = VFABSV_FLOAT(vx1, vl); + + vx0 = VFADDVV_FLOAT(vx0, vx1, vl); + + // index where element less than v_min + mask = VMFLTVV_FLOAT(vx0, v_min, vl); + v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl); + v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl); + } + + } + + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax); + + v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, vlmax); + min_index = VFIRSTM(mask, vlmax); + + v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax); + min_index = VMVVXS_UINT(v_min_index); + + return(min_index+1); +} diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 818193a9e..c76a38099 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -31,235 +31,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
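A note on the epilogue these `i*_rvv.c` kernels share (visible at the end of izamin_rvv.c just above): `vfredmin`/`vfredmax` collapses the per-lane extrema to a scalar, `vmfle`/`vmfge` rebuilds a mask of the lanes that attain it, `vfirst` yields the lowest such lane, and `vslidedown` + `vmv.x.s` pulls that lane's stored element index; the `+1` makes the result 1-based, with 0 reserved for invalid input. The reworked `*_vector.c` kernels in this patch reach the same result with `vcompress` + `vmv.x.s` instead of spilling the index vector to a stack array. A small array-based model of that selection step, with illustrative names and double precision assumed:

```c
/*
 * Model of the selection step: given per-lane minima and the element index
 * each lane recorded, return the 1-based index held by the first lane that
 * attains the global minimum. Illustrative only.
 */
#include <float.h>
#include <stddef.h>

static size_t pick_result(const double *lane_min, const size_t *lane_idx, size_t lanes)
{
    double global_min = DBL_MAX;                 /* vfredmin over all lanes */
    for (size_t l = 0; l < lanes; l++)
        if (lane_min[l] < global_min)
            global_min = lane_min[l];

    for (size_t l = 0; l < lanes; l++)           /* vmfle + vfirst           */
        if (lane_min[l] <= global_min)
            return lane_idx[l] + 1;              /* vslidedown + vmv.x.s, +1 */

    return 0;                                    /* only reached when lanes == 0 */
}
```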
#if defined(DOUBLE) -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmin_vs_f64m8_f64m1)(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f64m8_f64m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8) +#endif #define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 -#define VMFIRSTM vmfirst_m_b8 +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8) +#define VMFIRSTM RISCV_RVV(vfirst_m_b8) #define UINT_V_T vuint64m8_t #define VSEVU_UINT vse64_v_u64m8 #define UINT_T long unsigned int -#define VIDV_MASK_UINT vid_v_u64m8_m -#define VIDV_UINT vid_v_u64m8 -#define VADDVX_MASK_UINT vadd_vx_u64m8_m -#define VADDVX_UINT vadd_vx_u64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 -#define VMVVX_UINT vmv_v_x_u64m8 +#define VIDV_UINT RISCV_RVV(vid_v_u64m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8) +#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64) #else -#define ABS fabsf -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmin_vs_f32m8_f32m1)(v_res, va, vb, gvl) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m) +#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl) +#else +#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f32m8_f32m1) +#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu) +#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu) +#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8) +#endif #define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 -#define VMFIRSTM vmfirst_m_b4 +#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4) +#define 
VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8) +#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4) +#define VMFIRSTM RISCV_RVV(vfirst_m_b4) #define UINT_V_T vuint32m8_t #define UINT_T unsigned int -#define VSEVU_UINT vse32_v_u32m8 -#define VIDV_MASK_UINT vid_v_u32m8_m -#define VIDV_UINT vid_v_u32m8 -#define VADDVX_MASK_UINT vadd_vx_u32m8_m -#define VADDVX_UINT vadd_vx_u32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 -#define VMVVX_UINT vmv_v_x_u32m8 +#define VSEVU_UINT RISCV_RVV(vse32_v_u32m8) +#define VIDV_UINT RISCV_RVV(vid_v_u32m8) +#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8) +#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8) +#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8) +#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32) #endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0, j=0; - FLOAT minf=FLT_MAX; + BLASLONG i=0, j=0; unsigned int min_index = 0; - if (n <= 0 || inc_x <= 0) return(min_index); + if (n <= 0 || inc_x <= 0) return(min_index); + FLOAT minf=FLT_MAX; - FLOAT_V_T vx0, vx1, v_min; + FLOAT_V_T vx, vx2, v_min; UINT_V_T v_min_index; - MASK_T mask0, mask1; + MASK_T mask; unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); gvl = VSETVL(n); - UINT_T temp_uint[gvl]; - v_min_index = VMVVX_UINT(0, gvl); + unsigned int stride_x = inc_x * 2 * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x * 2; + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = gvl * inc_x * 2; - BLASLONG ix = 0; + v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + vx = VFABS_FLOAT(vx, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + vx = VFADDVV_FLOAT(vx, vx2, gvl); - //index where element less than v_min - mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); - v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e64,m8 \n\t" - "vid.v %0, v0.t \n\t" - :"+v"(v_min_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1 \n\t" - "vsetvli x0, %2, e32,m8 \n\t" - "vid.v %0, v0.t \n\t" - 
:"+v"(v_min_index) - :"v"(mask0), "r"(gvl) - :"v0"); -#endif -*/ - v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); + + //index where element greater than v_min + mask = VMFGTVV_FLOAT(v_min, vx, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j - v_min = VFMINVV_FLOAT(v_min, vx0, gvl); + v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; - ix += inc_xv; + idx += inc_v; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = VFMVFS_FLOAT(v_res); - mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); - min_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_min_index,gvl); - min_index = temp_uint[min_index]; + + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + minf = EXTRACT_FLOAT(v_res); + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); if(j < n){ gvl = VSETVL(n-j); - v_min_index = VMVVX_UINT(0, gvl); - vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - //fabs(vector) - mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx0) - :"v"(mask0), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - //fabs(vector) - mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); -/* -#if defined(DOUBLE) -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e64,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#else -asm volatile( - "vor.vv v0, %1, %1\n\t" - "vsetvli x0, %3, e32,m8 \n\t" - "vfrsub.vf %0, %0, %2, v0.t \n\t" - :"+v"(vx1) - :"v"(mask1), "f"(zero), "r"(gvl) - :"v0"); -#endif -*/ - v_min = VFADDVV_FLOAT(vx0, vx1, gvl); - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - FLOAT cur_minf = VFMVFS_FLOAT(v_res); + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl); + v_min = VFABS_FLOAT(v_min, gvl); + vx2 = VFABS_FLOAT(vx2, gvl); + v_min = VFADDVV_FLOAT(v_min, vx2, gvl); + + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); + FLOAT cur_minf = EXTRACT_FLOAT(v_res); if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); v_min_index = VADDVX_UINT(v_min_index, j, gvl); - mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl); - min_index = VMFIRSTM(mask0,gvl); - VSEVU_UINT(temp_uint,v_min_index,gvl); - min_index = temp_uint[min_index]; - + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + UINT_V_T compressed; + compressed = VCOMPRESS(v_min_index, mask, gvl); + min_index = VMV_X(compressed); } } - return(min_index+1); -} - + return(min_index+1); +} diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c new file mode 100644 index 000000000..745c27bf4 --- /dev/null +++ b/kernel/riscv64/max_rvv.c @@ -0,0 +1,98 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T vx, vmax; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(-FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index 7f31e9a53..ee9920cd2 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -28,29 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMAXVV_FLOAT vfmax_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif +#else +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) #endif +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -59,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=-FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_min; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -76,15 +92,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_max = VFMAXVV_FLOAT(v_max, v1, gvl); j += gvl * 2; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); } for(;j maxf) - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } }else{ @@ -102,18 +115,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl * 2; idx += inc_xv * 2; } - v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl); } for(;j maxf) - maxf = *((FLOAT*)&v_res); + v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl); j += gvl; } } + maxf = EXTRACT_FLOAT(v_res); return(maxf); } diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c new file mode 100644 index 000000000..78528fef9 --- /dev/null +++ b/kernel/riscv64/min_rvv.c @@ -0,0 +1,98 @@ 
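On the macro scheme introduced for max_vector.c above and for min_vector.c and scal_vector.c further below: `ELEN` and `LMUL` are picked per target (RISCV64_ZVL256B builds drop to LMUL=m2, everything else keeps m8), and the `JOIN`/`JOIN2` token-pasting helpers assemble the matching RVV type and intrinsic names, with `RISCV_RVV` abstracting over the 0.10 versus 1.0 intrinsic naming. A minimal, self-contained illustration of the pasting part only; the `STR` helper is added here purely to print the generated name and is not part of the kernels.

```c
/*
 * Minimal demonstration of the JOIN token-pasting used above; only the type
 * name is built here, as an example.
 */
#include <stdio.h>

#define LMUL m8    /* RISCV64_ZVL256B builds use m2 instead */
#define ELEN 32

#define _
#define JOIN2_X(x, y) x ## y
#define JOIN2(x, y)   JOIN2_X(x, y)
#define JOIN(v, w, x, y, z) JOIN2(JOIN2(JOIN2(JOIN2(v, w), x), y), z)

#define STR_X(x) #x
#define STR(x)   STR_X(x)

int main(void)
{
    /* Prints "vfloat32m8_t"; with LMUL set to m2 it would print "vfloat32m2_t". */
    puts(STR(JOIN(vfloat, ELEN, LMUL, _t, _)));
    return 0;
}
```

This is how one kernel source serves both the generic and the fixed-VLEN RISC-V targets without duplicating the intrinsic plumbing.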
+/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m8_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m8_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T vx, vmin; + FLOAT_V_T_M1 v_res; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(vmin, v_res, 
vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index 14b7e01ed..2001840bb 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -28,29 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMINVV_FLOAT vfmin_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif +#else +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) #endif +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt_vf_f), ELEN, LMUL, _b, MLEN) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMINVV_FLOAT JOIN(RISCV_RVV(vfmin), _vv_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -59,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); if(inc_x == 1){ gvl = VSETVL(n); @@ -76,15 +92,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_min = VFMINVV_FLOAT(v_min, v1, gvl); j += gvl * 2; } - v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); - minf = *((FLOAT*)&v_res); + v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl); } for(;j= gvl ) // don't pay overheads if we're not doing useful work + { + for(i=0; i= gvl ) // don't pay overheads if we're not doing useful work + { + for(i=0; i 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSEV_FLOAT(x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, 
vy, vl); + VSEV_FLOAT(y, v1, vl); + } + + } else if(inc_y == 1) { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSEV_FLOAT(y, v1, vl); + } + + } else if(inc_x == 1) { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSEV_FLOAT(x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSSEV_FLOAT(y, stride_y, v1, vl); + } + + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + + v0 = VFMULVF_FLOAT(vx, c, vl); + v0 = VFMACCVF_FLOAT(v0, s, vy, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + + v1 = VFMULVF_FLOAT(vx, s, vl); + v1 = VFMSACVF_FLOAT(v1, c, vy, vl); + VSSEV_FLOAT(y, stride_y, v1, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c index f3786e1d0..649d9bb94 100644 --- a/kernel/riscv64/rot_vector.c +++ b/kernel/riscv64/rot_vector.c @@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFMSACVF_FLOAT vfmsac_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFMSACVF_FLOAT vfmsac_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f64m4) #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -57,11 +57,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG ix=0,iy=0; if(n <= 0) return(0); - 
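/* Scalar reference (a sketch, not part of the kernel) of the plane rotation applied by
   the four stride-specialised loops above: x[i] <- c*x[i] + s*y[i], y[i] <- c*y[i] - s*x[i].
   The vector paths compute exactly this with vfmul/vfmacc/vfmsac, choosing unit-stride
   (vle/vse) or strided (vlse/vsse) accesses per operand. */
void rot_ref(long n, double *x, long inc_x, double *y, long inc_y, double c, double s)
{
    long ix = 0, iy = 0;
    for (long i = 0; i < n; i++, ix += inc_x, iy += inc_y) {
        double xi = x[ix], yi = y[iy];
        x[ix] = c * xi + s * yi;   /* matches v0 = vfmacc(vfmul(vx, c), s, vy) */
        y[iy] = c * yi - s * xi;   /* matches v1 = vfmsac(vfmul(vx, s), c, vy) */
    }
}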
unsigned int gvl = 0; + unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); FLOAT_V_T v0, v1, vx, vy; if(inc_x == 1 && inc_y == 1){ - gvl = VSETVL(n); for(i=0,j=0; i 0; n -= vl, x += vl) { + vl = VSETVL(n); + VSEV_FLOAT(x, v0, vl); + } + } + else { + for (size_t vl; n > 0; n -= vl, x += vl) { + vl = VSETVL(n); + v0 = VLEV_FLOAT(x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSEV_FLOAT(x, v0, vl); + } + } + } else { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + if(da == 0.0) { + int gvl = VSETVL_MAX; + v0 = VFMVVF_FLOAT(0.0, gvl); + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + VSSEV_FLOAT(x, stride_x, v0, vl); + } + } + else { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + v0 = VLSEV_FLOAT(x, stride_x, vl); + v0 = VFMULVF_FLOAT(v0, da, vl); + VSSEV_FLOAT(x, stride_x, v0, vl); + } + } + } + + return 0; +} diff --git a/kernel/riscv64/scal_vector.c b/kernel/riscv64/scal_vector.c index 8b9ef5a3e..8fa9315f6 100644 --- a/kernel/riscv64/scal_vector.c +++ b/kernel/riscv64/scal_vector.c @@ -26,28 +26,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 -#define VFMULVF_FLOAT vfmul_vf_f32m8 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 -#define VFMULVF_FLOAT vfmul_vf_f64m8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMULVF_FLOAT JOIN(RISCV_RVV(vfmul), _vf_f, ELEN, LMUL, _) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0,j=0; @@ -84,25 +97,25 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } }else{ if(da == 0.0){ + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG ix = 0; gvl = VSETVL(n); - BLASLONG stride_x = inc_x * sizeof(FLOAT); - BLASLONG ix = 0; - if(gvl <= n / 2){ - long int inc_xv = gvl * inc_x; - v0 = VFMVVF_FLOAT(0, gvl); - for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){ - VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); - VSSEV_FLOAT(&x[ix + inc_xv], stride_x, v0, gvl); - ix += inc_xv * 2; - } 
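/* Scalar sketch of the SCAL behaviour implemented above: when da == 0 the kernel stores
   zeros outright (a single splat reused across the loop) instead of multiplying, which
   also avoids reloading x; otherwise each element is scaled in place. Strided access is
   handled by stepping the index by inc_x, mirroring the vlse/vsse byte stride
   inc_x * sizeof(FLOAT) used by the vector code. */
void scal_ref(long n, double da, double *x, long inc_x)
{
    if (da == 0.0) {
        for (long i = 0, ix = 0; i < n; i++, ix += inc_x)
            x[ix] = 0.0;           /* splat-and-store path, no load needed */
    } else {
        for (long i = 0, ix = 0; i < n; i++, ix += inc_x)
            x[ix] *= da;           /* load, vfmul_vf, store path */
    }
}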
+ v0 = VFMVVF_FLOAT(0, gvl); + + for(i = 0; i < n/(gvl*2); ++i ){ + VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); + ix += inc_x * gvl; + VSSEV_FLOAT(&x[ix], stride_x, v0, gvl); + ix += inc_x * gvl; } - //tail - for(; j 0; n -= vl, x += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl); + } + + } + + v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); + sumf = VFMVFS_FLOAT_M1(v_res); + return(sumf); +} diff --git a/kernel/riscv64/sum_vector.c b/kernel/riscv64/sum_vector.c new file mode 100644 index 000000000..cf734faab --- /dev/null +++ b/kernel/riscv64/sum_vector.c @@ -0,0 +1,114 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
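/* Plain-C sketch of the accumulation pattern used by the sum kernel above:
   VFADDVV_FLOAT_TU adds each strip-mined chunk lane-wise into a running vector of
   partial sums ("tail undisturbed" keeps lanes past the current vl intact), and a single
   vfredusum at the end collapses all lanes. VLMAX below is only an illustrative stand-in
   for the hardware vector length. */
#define VLMAX 16

double sum_ref(long n, const double *x)
{
    double lanes[VLMAX] = {0.0};           /* plays the role of the in-register vsum */
    while (n > 0) {
        long vl = n < VLMAX ? n : VLMAX;   /* what vsetvl would return */
        for (long i = 0; i < vl; i++)
            lanes[i] += x[i];              /* lanes >= vl are left untouched */
        x += vl;
        n -= vl;
    }
    double s = 0.0;                        /* final reduction over all VLMAX lanes */
    for (long i = 0; i < VLMAX; i++)
        s += lanes[i];
    return s;
}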
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f32m8_f32m1) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8) +#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f64m8_f64m1) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8) +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_sum; + FLOAT_V_T_M1 v_res; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + + if(inc_x == 1){ + gvl = VSETVL(n); + if(gvl <= n/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i 0; m -= vl, ptr -= vl*inc_y) { + vl = VSETVL(m); + vy = VLSEV_FLOAT(ptr - 1, stride_y, vl); + VSSEV_FLOAT(ptr, stride_y, vy, vl); + } + y[0] = temp; + } + else if(inc_y == 0) { + FLOAT temp = y[0]; + y[0] = x[(n - 1) * inc_x]; + FLOAT* ptr = x + (n - 1) * inc_x; // start from the last one + stride_x = (0 - inc_x) * sizeof(FLOAT); // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x) { + vl = VSETVL(m); + vx = VLSEV_FLOAT(ptr - 1, stride_x, vl); + VSSEV_FLOAT(ptr, stride_x, vx, vl); + } + x[0] = temp; + } + else if(inc_x == 1 && inc_y == 1) { + for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLEV_FLOAT(y, vl); + VSEV_FLOAT(y, vx, vl); + VSEV_FLOAT(x, vy, vl); + } + + } else if (inc_y == 1) { + stride_x = inc_x * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLEV_FLOAT(y, vl); + VSEV_FLOAT(y, vx, vl); + VSSEV_FLOAT(x, stride_x, vy, vl); + } + + } else if(inc_x == 1) { + stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLEV_FLOAT(x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + VSSEV_FLOAT(y, stride_y, vx, vl); + VSEV_FLOAT(x, vy, vl); + } + + } else { + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) { + vl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, vl); + vy = VLSEV_FLOAT(y, stride_y, vl); + VSSEV_FLOAT(y, stride_y, vx, vl); + VSSEV_FLOAT(x, stride_x, vy, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c index 82fa5ce31..f583f5392 100644 --- a/kernel/riscv64/swap_vector.c +++ b/kernel/riscv64/swap_vector.c @@ -27,35 +27,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
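/* Scalar sketch of the inc_y == 0 special case handled by the swap kernel above:
   swapping every x[i] with the single y element is, done sequentially, a shift of x by
   one position plus an exchange with y. The vector code reproduces that final state
   directly with reverse-stride loads/stores instead of iterating element by element. */
void swap_incy0_ref(long n, double *x, long inc_x, double *y)
{
    double temp = y[0];
    y[0] = x[(n - 1) * inc_x];                 /* y ends up holding the last x */
    for (long i = n - 1; i > 0; i--)
        x[i * inc_x] = x[(i - 1) * inc_x];     /* shift x up by one slot, from the top down */
    x[0] = temp;                               /* first x gets the old y */
}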
#include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; BLASLONG ix = 0,iy = 0; BLASLONG stride_x, stride_y; FLOAT_V_T vx0, vx1, vy0, vy1; - unsigned int gvl = 0; - if (n < 0) return(0); + if (n <= 0) return(0); + + unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); + if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } + if(inc_x == 1 && inc_y == 1){ - gvl = VSETVL(n); if(gvl <= n/2){ for(i=0,j=0; i 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX + posY * lda; + ao2 = a + posY + (posX) * lda; + + for (i = m; i > 0; i--, offset--) { + va2 = VLSEV_FLOAT(ao2, stride_lda, vl); + va1 = VLEV_FLOAT(ao1, vl); + + // offset > (0 - vindex) ---> (offset + vindex) > 0 + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl); + VSEV_FLOAT(b, vb, vl); + + b += vl; + ao1 += lda; + ao2++; + } + } + + return 0; +} + diff --git a/kernel/riscv64/symm_ucopy_rvv_v1.c b/kernel/riscv64/symm_ucopy_rvv_v1.c new file mode 100644 index 000000000..464f97b3a --- /dev/null +++ b/kernel/riscv64/symm_ucopy_rvv_v1.c @@ -0,0 +1,100 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
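/* Scalar sketch of the per-lane select done in symm_lcopy_rvv_v1.c above with
   vid_v / vadd_vx / vmsgt_vx / vmerge_vvm: only one triangle of the symmetric matrix is
   stored, so each packed element comes either from the unit-stride pointer ao1 or from
   the lda-strided pointer ao2, decided by the sign of (offset + lane). The vector code
   evaluates that predicate for all lanes at once and merges the two loads. */
void symm_lcopy_lane_select_ref(long vl, long offset, long lda,
                                const double *ao1, const double *ao2, double *b)
{
    for (long lane = 0; lane < vl; lane++)
        b[lane] = ((offset + lane) > 0) ? ao1[lane]          /* VLEV_FLOAT side  */
                                        : ao2[lane * lda];   /* VLSEV_FLOAT side */
}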
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#endif + +// Optimizes the implementation in ../generic/symm_ucopy_4.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1, va2; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posX + 0 + posY * lda; + + for (i = m; i > 0; i--, offset--) { + va1 = VLSEV_FLOAT(ao1, stride_lda, vl); + va2 = VLEV_FLOAT(ao2, vl); + + // offset > (0 - vindex) ---> (offset + vindex) > 0 + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb = VMERGE_VVM_FLOAT(va2, va1, vbool, vl); + VSEV_FLOAT(b, vb, vl); + + b += vl; + ao1++; + ao2 += lda; + } + } + + return 0; +} diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c new file mode 100644 index 000000000..888d628a5 --- /dev/null +++ b/kernel/riscv64/symv_L_rvv.c @@ -0,0 +1,219 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1; + FLOAT *a_ptr = a; + + FLOAT_V_T_M1 v_res, v_z0; + size_t vlmax = VSETVL_MAX_M1, vl; + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL_MAX; + + FLOAT_V_T va, vx, vy, vr; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + if(inc_x == 1 && inc_y == 1) + { + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLEV_FLOAT(&y[i], 
vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + + y[j] += alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + } + } + else if(inc_x == 1) + { + jy = 0; + stride_y = inc_y * sizeof(FLOAT); + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_yv = inc_y * vl; + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + + y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); + jy += inc_y; + a_ptr += lda; + } + } + else if(inc_y == 1) + { + jx = 0; + stride_x = inc_x * sizeof(FLOAT); + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLEV_FLOAT(&y[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + ix += inc_xv; + } + + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + + y[j] += alpha * VFMVFS_FLOAT_M1(v_res); + jx += inc_x; + a_ptr += lda; + } + } + else + { + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + inc_yv = inc_y * vl; + + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax); + + y[jy] += alpha * VFMVFS_FLOAT_M1(v_res); + jx += inc_x; + jy += inc_y; + a_ptr += lda; + } + } + return(0); +} + diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c index 58ec17b03..cd89c63ec 100644 --- a/kernel/riscv64/symv_L_vector.c +++ b/kernel/riscv64/symv_L_vector.c @@ -27,37 +27,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
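/* Scalar reference (simplified sketch) of the lower-triangle SYMV update the kernels
   above implement for the first `offset` columns: one pass both updates y with
   temp1*A(:,j) and accumulates the dot product needed for y[j]. The RVV code strip-mines
   the inner loop, fusing the axpy (vfmacc_vf) and the dot-product accumulation
   (vfmacc_vv, tail-undisturbed) in a single sweep, then finishes with one vfredusum. */
void symv_L_ref(long m, long offset, double alpha, const double *a, long lda,
                const double *x, double *y)
{
    for (long j = 0; j < offset; j++) {
        double temp1 = alpha * x[j];
        double temp2 = 0.0;
        y[j] += temp1 * a[j + j * lda];            /* diagonal element */
        for (long i = j + 1; i < m; i++) {
            y[i]  += temp1 * a[i + j * lda];       /* y += temp1 * A(:,j) below the diagonal */
            temp2 += x[i]  * a[i + j * lda];       /* dot(A(:,j), x) below the diagonal      */
        }
        y[j] += alpha * temp2;
    }
}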
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -99,8 +105,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < m){ gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); @@ -110,8 +116,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[j] += alpha * temp2; @@ -144,8 +150,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - 
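/* Minimal sketch (an assumed helper, defined in the common RISC-V headers rather than in
   this file) of what the RISCV_RVV() wrapper and the RISCV_0p10_INTRINSICS branches
   accommodate: toolchains implementing the v1.0 intrinsics prefix every name with
   __riscv_ and drop the explicit destination operand of reductions, while the older
   0.10-era intrinsics keep both, hence the two spellings of VFREDSUM_FLOAT above. */
#ifdef RISCV_0p10_INTRINSICS
#  define RISCV_RVV(x) x              /* e.g. vfredusum_vs_f32m4_f32m1(v_res, vr, v_z0, gvl) */
#else
#  define RISCV_RVV(x) __riscv_##x    /* e.g. __riscv_vfredusum_vs_f32m4_f32m1(vr, v_z0, gvl) */
#endif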
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < m){ gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -155,8 +161,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[jy] += alpha * temp2; @@ -190,8 +196,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < m){ gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); @@ -201,8 +207,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[j] += alpha * temp2; @@ -241,8 +247,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < m){ gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -252,8 +258,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[jy] += alpha * temp2; diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c new file mode 100644 index 000000000..bcd2f6981 --- /dev/null +++ b/kernel/riscv64/symv_U_rvv.c @@ -0,0 +1,216 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m8_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m8_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m8_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m8_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1; + FLOAT *a_ptr = a; + FLOAT_V_T_M1 v_res, v_z0; + size_t vl_max = VSETVL_MAX_M1, vl; + v_z0 = VFMVVF_FLOAT_M1(0, vl_max); + vl_max = VSETVL_MAX; + + FLOAT_V_T va, vx, vy, vr; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + BLASLONG m1 = m - offset; + if(inc_x == 1 && inc_y == 1) + { + a_ptr += m1 * lda; + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + vy = VLEV_FLOAT(&y[i], vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); + + y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + } + } + else if(inc_x == 1) + { + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_yv = inc_y * vl; + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + 
VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLEV_FLOAT(&x[i], vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); + + y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jy += inc_y; + } + } + else if(inc_y == 1) + { + jx = m1 * inc_x; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + + vy = VLEV_FLOAT(&y[i], vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSEV_FLOAT(&y[i], vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + + ix += inc_xv; + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); + + y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jx += inc_x; + } + } + else + { + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0; k -= vl, i += vl) + { + vl = VSETVL(k); + inc_xv = inc_x * vl; + inc_yv = inc_y * vl; + vy = VLSEV_FLOAT(&y[iy], stride_y, vl); + va = VLEV_FLOAT(&a_ptr[i], vl); + vy = VFMACCVF_FLOAT(vy, temp1, va, vl); + VSSEV_FLOAT(&y[iy], stride_y, vy, vl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, vl); + vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl); + ix += inc_xv; + iy += inc_yv; + } + v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max); + + y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res); + a_ptr += lda; + jx += inc_x; + jy += inc_y; + } + } + return(0); +} diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 34ff0e30a..894c6a643 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -27,39 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
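/* Scalar reference (sketch) of the upper-triangle SYMV columns handled by symv_U_rvv.c
   above: the kernel walks only the trailing m - offset columns (m1 onward), updates y
   with the part of the column strictly above the diagonal, and folds the diagonal term
   into y[j] together with the reduced dot product, matching
   y[j] += temp1 * a_ptr[j] + alpha * sum in the vector code. */
void symv_U_ref(long m, long offset, double alpha, const double *a, long lda,
                const double *x, double *y)
{
    long m1 = m - offset;
    for (long j = m1; j < m; j++) {
        double temp1 = alpha * x[j];
        double temp2 = 0.0;
        for (long i = 0; i < j; i++) {
            y[i]  += temp1 * a[i + j * lda];     /* strictly above the diagonal */
            temp2 += x[i]  * a[i + j * lda];
        }
        y[j] += temp1 * a[j + j * lda] + alpha * temp2;
    }
}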
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT vfdot_vv_f32m4 -#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT vfdot_vv_f64m4 -#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -101,8 +107,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < j){ gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); @@ -112,8 +118,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += 
EXTRACT_FLOAT(v_res); } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -145,8 +151,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < j){ gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -156,8 +162,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; @@ -190,8 +196,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < j){ gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); @@ -201,8 +207,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -240,8 +246,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 = EXTRACT_FLOAT(v_res); if(i < j){ gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -251,8 +257,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp2 += VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr, v_z0, gvl); + temp2 += EXTRACT_FLOAT(v_res); } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c new file mode 100644 index 000000000..4135a9b62 --- /dev/null +++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c @@ -0,0 +1,138 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_lncopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T vb, va1; + + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY + posX * lda; + } + else + { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + + ao ++; + b += vl; + X ++; + i ++; + } + else if (X < posY) + { + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao++; + b += vl; + } + + X += vl; + i += vl; + } + } while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trmm_ltcopy_rvv_v1.c b/kernel/riscv64/trmm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..580714fde --- /dev/null +++ b/kernel/riscv64/trmm_ltcopy_rvv_v1.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
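/* Scalar sketch of the masked diagonal-block packing done in trmm_lncopy_rvv_v1.c above
   with vid_v / vmsgtu_vx / vfmerge_vfm: lanes past the diagonal are forced to zero and,
   for unit-diagonal (UNIT) kernels, the diagonal lane itself is forced to one; everything
   else is copied through. `stride` is in elements here, whereas VLSEV_FLOAT takes a byte
   stride in the real code. */
void trmm_diag_block_ref(long vl, const double *col, long stride, double *b,
                         long j, int unit_diag)
{
    for (long lane = 0; lane < vl; lane++) {
        double v = col[lane * stride];              /* what VLSEV_FLOAT loads           */
        if (lane > j)
            v = 0.0;                                /* VMSGTU_VX + VFMERGE with ZERO    */
        else if (unit_diag && lane == j)
            v = 1.0;                                /* VMSEQ_VX + VFMERGE with ONE      */
        b[lane] = v;
    }
}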
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_ltcopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + FLOAT_V_T vb, va1; + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY + posX * lda; + } + else + { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) + { + ao ++; + b += vl; + X ++; + i ++; + } + else if (X < posY) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao += lda; + b += vl; + } + X += vl; + i += vl; + + } + } while (i < m); + + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/trmm_uncopy_rvv_v1.c b/kernel/riscv64/trmm_uncopy_rvv_v1.c new file mode 100644 index 000000000..852ab7f11 --- /dev/null +++ b/kernel/riscv64/trmm_uncopy_rvv_v1.c @@ -0,0 +1,136 @@ 
+/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_uncopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + BLASLONG stride_lda = sizeof(FLOAT) * lda; + FLOAT *ao; + + FLOAT_V_T vb, va1; + size_t vl; + +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posX + posY * lda; + } + else + { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + + ao ++; + b += vl; + X ++; + i ++; + } + else if (X > posY) + { + ao += lda; + b += vl; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = 
VMSLTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao++; + b += vl; + } + + X += vl; + i += vl; + } + }while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trmm_utcopy_rvv_v1.c b/kernel/riscv64/trmm_utcopy_rvv_v1.c new file mode 100644 index 000000000..e0b6d362d --- /dev/null +++ b/kernel/riscv64/trmm_utcopy_rvv_v1.c @@ -0,0 +1,133 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +// Optimizes the implementation in ../arm64/tmmm_utcopy_sve_v1.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, j, js, X; + + FLOAT *ao; + FLOAT_V_T vb, va1; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + + X = posX; + + if (posX <= posY) + { + ao = a + posX + posY * lda; + } + else + { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) + { + ao ++; + b += vl; + X ++; + i++; + } + else if (X > posY) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + ao += lda; + b += vl; + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + vb = VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); +#endif + VSEV_FLOAT(b, vb, vl); + ao += lda; + b += vl; + } + X += vl; + i += vl; + } + }while (i < m); + posY += vl; + } + return 0; +} + diff --git a/kernel/riscv64/trmmkernel_rvv_v1x8.c b/kernel/riscv64/trmmkernel_rvv_v1x8.c new file mode 100644 index 000000000..393b24bce --- /dev/null +++ b/kernel/riscv64/trmmkernel_rvv_v1x8.c @@ -0,0 +1,685 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + + +// Optimizes the implementation in ../generic/trmmkernel_8x8.c + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + //fprintf(stderr, "%s, %s, bm=%4ld bn=%4ld bk=%4ld alpha=%f ldc=%ld\n", __FILE__, __FUNCTION__, bm, bn, bk, alpha, ldc); + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; + + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + size_t vl; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + for (j = bn/8; j > 0; j--) + { + C0 = C; + C1 = C0+ldc; + C2 = C1+ldc; + C3 = C2+ldc; + C4 = C3+ldc; + C5 = C4+ldc; + C6 = C5+ldc; + C7 = C6+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*8; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+8; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + ptrbb += 8; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + 
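/*
 * This k-loop is unrolled by eight: each step loads the next vector of the packed
 * A panel (va2, va3, ...) before the rank-1 updates with the previous vector have
 * finished, presumably so the eight accumulators vres0 .. vres7 keep the FMA units
 * busy while the loads complete. Each group of eight VFMACCVF_FLOAT calls consumes
 * one set of eight B scalars, *(ptrbb + 0) .. *(ptrbb + 7), against a single A vector.
 */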
vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl); + ptrbb += 8; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl); + ptrbb += 8; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl); + ptrbb += 8; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl); + ptrbb += 8; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl); + ptrbb += 8; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl); + ptrbb += 8; + } + + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); // M:8 (should be vlen); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb 
+ 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl); + vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl); + vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl); + vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + ptrba += vl; + } + + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alpha, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VFMULVF_FLOAT(vres3, alpha, vl); + VSEV_FLOAT(C3, va3, vl); + + va4 = VFMULVF_FLOAT(vres4, alpha, vl); + VSEV_FLOAT(C4, va4, vl); + + va5 = VFMULVF_FLOAT(vres5, alpha, vl); + VSEV_FLOAT(C5, va5, vl); + + va6 = VFMULVF_FLOAT(vres6, alpha, vl); + VSEV_FLOAT(C6, va6, vl); + + va7 = VFMULVF_FLOAT(vres7, alpha, vl); + VSEV_FLOAT(C7, va7, vl); + + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 8; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*8; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + C4 += vl; + C5 += vl; + C6 += vl; + C7 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; +#endif + + bb += (bk<<3); + C += (ldc<<3); + } + + if (bn & 4) + { + C0 = C; + C1 = C0+ldc; + C2 = C1+ldc; + C3 = C2+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*4; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + ptrbb += 4; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl); + ptrbb += 4; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl); + ptrbb += 4; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl); + ptrbb += 4; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, 
*(ptrbb + 1), va4, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl); + ptrbb += 4; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl); + ptrbb += 4; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl); + ptrbb += 4; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl); + ptrbb += 4; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl); + vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + ptrba += vl; + } + + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + + va2 = VFMULVF_FLOAT(vres2, alpha, vl); + VSEV_FLOAT(C2, va2, vl); + + va3 = VFMULVF_FLOAT(vres3, alpha, vl); + VSEV_FLOAT(C3, va3, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 4; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*4; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + C2 += vl; + C3 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + bb += (bk<<2); + C += (ldc<<2); + } + + if (bn & 2) + { + C0 = C; + C1 = C0+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+2; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl); + ptrbb += 2; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl); + ptrbb += 2; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl); + ptrbb += 2; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 
0), va4, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl); + ptrbb += 2; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 2; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl); + ptrbb += 2; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + ptrba += vl; + } + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + + va1 = VFMULVF_FLOAT(vres1, alpha, vl); + VSEV_FLOAT(C1, va1, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + C1 += vl; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + + bb += (bk<<1); + C += (ldc<<1); + } + + if (bn & 1) + { + C0 = C; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + ptrba = ba; + + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl; + ptrbb = bb + off*1; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+1; // number of values in B +#endif + + for (k = temp/8; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + va1 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + ptrbb += 1; + va2 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl); + ptrbb += 1; + va3 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl); + ptrbb += 1; + va4 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl); + ptrbb += 1; + va5 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl); + ptrbb += 1; + va6 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl); + ptrbb += 1; + va7 = VLEV_FLOAT(ptrba, vl); + ptrba += vl; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl); + ptrbb += 1; + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl); + ptrbb += 1; + } + + // K remainder + for (k = temp&7; k > 0; k--) { + va0 = VLEV_FLOAT(ptrba, vl); + + vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl); + + ptrbb += 1; + ptrba += vl; + } + va0 = VFMULVF_FLOAT(vres0, alpha, vl); + VSEV_FLOAT(C0, va0, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*vl; + ptrbb += temp*1; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl; + } + +#if defined(TRMMKERNEL) && 
!defined(LEFT) + off += 1; +#endif + + bb += (bk); + C += (ldc); + } + return 0; +} + diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c new file mode 100644 index 000000000..869561fb3 --- /dev/null +++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c @@ -0,0 +1,364 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_LN_sve.c + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT aa; + FLOAT* pc; + + int i, j, k; + + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + + FLOAT_V_T vb, vc; + + size_t vl; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vb = VLSEV_FLOAT(pc + i, stride_ldc, vl); + vb = VFMULVF_FLOAT(vb, aa, vl); + VSEV_FLOAT(b, vb, vl); + VSSEV_FLOAT(pc + i, stride_ldc, vb, vl); + b += vl; + + for (k = 0; k < i; k ++) { + vc = VLSEV_FLOAT(pc + k, stride_ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl); + VSSEV_FLOAT(pc + k, stride_ldc, vc, vl); + } + pc += vl * ldc; + } + a -= m; + b -= 2 * n; + } + +} +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT *pc; + int i, j, k; + + BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; + + FLOAT_VX2_T vbx2, vsx2, vcx2; + FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; + size_t vl; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + pc = c; + + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl); + vb1 = VGET_VX2(vbx2, 0); + vb2 = VGET_VX2(vbx2, 1); +#ifndef CONJ + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = 
VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFMACCVF_FLOAT(vs2, aa2, vb1, vl); +#else + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFMACCVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); +#endif + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(b, vsx2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl); + b += vl * 2; + + for (k = 0; k < i; k ++) { + vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); +#ifndef CONJ + vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#endif + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl); + } + pc += vl * ldc * 2; + } + a -= m * 2; + b -= 4 * n; + } +} + + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + i = m % vl; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = vl; + if (i <= m) { + aa = a + (m - mod - vl) * k * COMPSIZE; + cc = c + (m - mod - vl) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + (kk - vl) * vl * COMPSIZE, + b + (kk - vl) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= vl * k * COMPSIZE; + cc -= vl * COMPSIZE; + kk -= vl; + + i += vl; + } while (i <= m); + } + + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + i = m % vl; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = vl; + if (i <= m) { + aa = a + (m - mod - vl) * k * COMPSIZE; + cc = c + (m - mod - vl) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, j, + aa + (kk - vl) * vl 
* COMPSIZE, + b + (kk - vl) * j * COMPSIZE, + cc, ldc); + + aa -= vl * k * COMPSIZE; + cc -= vl * COMPSIZE; + kk -= vl; + + i += vl; + } while (i <= m); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c new file mode 100644 index 000000000..da443cfba --- /dev/null +++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c @@ -0,0 +1,341 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e32_v_f32m2x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSSEG2_FLOAT __riscv_vssseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa; + FLOAT* pc; + + int i, j, k; + + BLASLONG stride_ldc = sizeof(FLOAT) * ldc; + + FLOAT_V_T vb, vc; + + size_t vl; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + pc = c; + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vb = VLSEV_FLOAT(pc + i, stride_ldc, vl); + vb = VFMULVF_FLOAT(vb, aa, vl); + VSEV_FLOAT(b, vb, vl); + VSSEV_FLOAT(pc + i, stride_ldc, vb, vl); + b += vl; + + for (k = i + 1; k < m; k++) { + vc = VLSEV_FLOAT(pc + k, stride_ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl); + VSSEV_FLOAT(pc + k, stride_ldc, vc, vl); + } + pc += vl * ldc; + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT *pc; + int i, j, k; + + BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2; + + FLOAT_VX2_T vbx2, vsx2, vcx2; + FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2; + size_t vl; + + ldc *= 2; + + for (i = 0; i < m; i++) { + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + pc = c; + + for (j = n; j > 0; j -= vl) { + vl = VSETVL(j); + vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl); + vb1 = VGET_VX2(vbx2, 0); + vb2 = VGET_VX2(vbx2, 1); +#ifndef CONJ + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFMACCVF_FLOAT(vs2, aa2, 
vb1, vl); +#else + vs1 = VFMULVF_FLOAT(vb1, aa1, vl); + vs1 = VFMACCVF_FLOAT(vs1, aa2, vb2, vl); + vs2 = VFMULVF_FLOAT(vb2, aa1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl); +#endif + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(b, vsx2, vl); + VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl); + b += vl * 2; + + for (k = i + 1; k < m; k++) { + vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); +#ifndef CONJ + vc1 = VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl); +#endif + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl); + } + pc += vl * ldc * 2; + } + + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + + if (kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + kk * vl * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + kk += vl; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(vl, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(vl, j, + aa + kk * vl * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + kk += vl; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c new file mode 100644 index 000000000..32e481036 --- /dev/null +++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c @@ -0,0 +1,337 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_RN_sve.c + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) 
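/*
 * Real-valued back end of the right-side, non-transposed solve: for each column i
 * of the n x n triangular panel b, the corresponding column of C is scaled by
 * *(b + i) -- the diagonal entry, which the trsm_*copy_rvv_v1.c packing routines in
 * this patch store already inverted (see their INV() macro) -- written both to the
 * packed buffer a and back to C, and then eliminated from every later column k > i
 * with VFNMSACVF_FLOAT. The m dimension is the vectorized one, using unit-stride
 * VLEV_FLOAT / VSEV_FLOAT loads and stores.
 */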
{ + + FLOAT bb; + FLOAT *pci, *pcj; + + int i, j, k; + FLOAT_V_T va, vc; + + size_t vl; + for (i = 0; i < n; i++) { + + bb = *(b + i); + pci = c + i * ldc; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + va = VLEV_FLOAT(pci, vl); + va = VFMULVF_FLOAT(va, bb, vl); + VSEV_FLOAT(a, va, vl); + VSEV_FLOAT(pci, va, vl); + a += vl; + pci += vl; + for (k = i + 1; k < n; k ++){ + vc = VLEV_FLOAT(pcj + k * ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl); + VSEV_FLOAT(pcj + k * ldc, vc, vl); + } + pcj += vl; + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb1, bb2; + + FLOAT *pci, *pcj; + + int i, j, k; + + FLOAT_VX2_T vax2, vsx2, vcx2; + FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; + + size_t vl; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + pci = c + i * ldc * 2; + pcj = c; + + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + vax2 = VLSEG2_FLOAT(pci, vl); + va1 = VGET_VX2(vax2, 0); + va2 = VGET_VX2(vax2, 1); +#ifndef CONJ + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va1, bb2, vl); + vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl); +#else + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va2, bb1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); +#endif + vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(a, vsx2, vl); + VSSEG2_FLOAT(pci, vsx2, vl); + a += vl * 2; + pci += vl * 2; + + for (k = i + 1; k < n; k ++){ + vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); +#ifndef CONJ + vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); +#endif + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl); + } + pcj += vl * 2; + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = vl; + + if (i <= m) { + do { + if (kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + kk * vl * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * 
ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = vl; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(vl, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(vl, j, + aa + kk * vl * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } + + i = m % vl; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c new file mode 100644 index 000000000..81cc41818 --- /dev/null +++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c @@ -0,0 +1,356 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +// Optimizes the implementation in ../arm64/trsm_kernel_RT_sve.c + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb; + FLOAT *pci, *pcj; + + int i, j, k; + FLOAT_V_T va, vc; + + size_t vl; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + pci = c + i * ldc; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + va = VLEV_FLOAT(pci, vl); + va = VFMULVF_FLOAT(va, bb, vl); + VSEV_FLOAT(a, va, vl); + VSEV_FLOAT(pci, va, vl); + a += vl; + pci += vl; + for (k = 0; k < i; k ++){ + vc = VLEV_FLOAT(pcj + k * ldc, vl); + vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl); + VSEV_FLOAT(pcj + k * ldc, vc, vl); + } + pcj += vl; + } + b -= n; + a -= 2 * m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT bb1, bb2; + + FLOAT *pci, *pcj; + + int i, j, k; + + FLOAT_VX2_T vax2, vsx2, vcx2; + FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2; + + size_t vl; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + pci = c + i * ldc * 2; + pcj = c; + for (j = m; j > 0; j -= vl) { + vl = VSETVL(j); + vax2 = VLSEG2_FLOAT(pci, vl); + va1 = VGET_VX2(vax2, 0); + va2 = VGET_VX2(vax2, 1); +#ifndef CONJ + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va1, bb2, vl); + vs2 = VFMACCVF_FLOAT(vs2, bb1, va2, vl); +#else + vs1 = VFMULVF_FLOAT(va1, bb1, vl); + vs1 = VFMACCVF_FLOAT(vs1, bb2, va2, vl); + vs2 = VFMULVF_FLOAT(va2, bb1, vl); + vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl); +#endif + 
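/*
 * At this point vs1/vs2 hold the real and imaginary parts of the current C column
 * multiplied by the diagonal entry (bb1, bb2) of the packed triangular factor b,
 * with the sign pattern above selected by CONJ. The two halves are re-interleaved
 * into a register group with VSET_VX2 and written with the segment store
 * VSSEG2_FLOAT both into the packed buffer a and back into C.
 */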
vsx2 = VSET_VX2(vsx2, 0, vs1); + vsx2 = VSET_VX2(vsx2, 1, vs2); + VSSEG2_FLOAT(a, vsx2, vl); + VSSEG2_FLOAT(pci, vsx2, vl); + a += vl * 2; + pci += vl * 2; + + for (k = 0; k < i; k ++){ + vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl); + vc1 = VGET_VX2(vcx2, 0); + vc2 = VGET_VX2(vcx2, 1); +#ifndef CONJ + vc1 = VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); +#else + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl); + vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl); + vc2 = VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl); + vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl); +#endif + vcx2 = VSET_VX2(vcx2, 0, vc1); + vcx2 = VSET_VX2(vcx2, 1, vc2); + VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl); + } + pcj += vl * 2; + } + b -= n * 2; + a -= 4 * m; + } +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + + size_t vl = VSETVL_MAX; + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = vl; + if (i <= m) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, j, + aa + (kk - j) * vl * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + i = m % vl; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = vl; + if (i <= m) { + do { + if (k - kk > 0) { + GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + vl * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(vl, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * vl * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += vl * k * COMPSIZE; + cc += vl * COMPSIZE; + i += vl; + } while (i <= m); + } + + i = m % vl; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/riscv64/trsm_lncopy_rvv_v1.c b/kernel/riscv64/trsm_lncopy_rvv_v1.c new file mode 100644 index 000000000..41c84be25 --- /dev/null +++ 
b/kernel/riscv64/trsm_lncopy_rvv_v1.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 + +#endif + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_lncopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + + *(b + j) = INV(*(ao + j * lda)); + ao++; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + } + ao++; + b += vl; + i++; + ii++; + } 
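/*
 * Sketch of the diagonal-block packing this copy loop produces (illustration only,
 * assuming vl = 4 and UNIT not defined; with UNIT the INV() entries become ONE):
 *
 *   row 0:  INV(a00)   .          .          .
 *   row 1:  a10        INV(a11)   .          .
 *   row 2:  a20        a21        INV(a22)   .
 *   row 3:  a30        a31        a32        INV(a33)
 *
 * "." marks lanes the routine leaves untouched. Rows of the panel below the
 * diagonal block (ii > jj) are copied whole with the strided load, rows above it
 * (ii < jj) are skipped, and inside the diagonal block the masked store keeps only
 * the strictly lower part of each row while the diagonal element itself is stored
 * as its reciprocal.
 */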
+ } + + a += vl * lda; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/trsm_ltcopy_rvv_v1.c b/kernel/riscv64/trsm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..003bd3465 --- /dev/null +++ b/kernel/riscv64/trsm_ltcopy_rvv_v1.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#endif + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_ltcopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + *(b + j) = INV(*(ao + j)); + + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + + b += vl; + ao += lda; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + } + ao += lda; + b += vl; + i ++; + ii ++; + } + } + + a += vl; + jj += vl; + } + return 0; +} + diff --git a/kernel/riscv64/trsm_uncopy_rvv_v1.c b/kernel/riscv64/trsm_uncopy_rvv_v1.c new file mode 100644 index 000000000..6cca5d49c --- /dev/null +++ b/kernel/riscv64/trsm_uncopy_rvv_v1.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#endif + + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_uncopy_sve.c + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + BLASLONG stride_lda = sizeof(FLOAT)*lda; + + FLOAT *ao; + jj = offset; + + FLOAT_V_T va1; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + i = 0; + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + *(b + j) = INV(*(ao + j * lda)); + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + ao++; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + va1 = VLSEV_FLOAT(ao, stride_lda, vl); + VSEV_FLOAT(b, va1, vl); + } + ao++; + b += vl; + i++; + ii++; + } + } + + a += vl * lda; + jj += vl; + } + return 0; +} diff --git a/kernel/riscv64/trsm_utcopy_rvv_v1.c b/kernel/riscv64/trsm_utcopy_rvv_v1.c new file mode 100644 index 000000000..bc058525f --- /dev/null +++ b/kernel/riscv64/trsm_utcopy_rvv_v1.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_M __riscv_vse32_v_f32m2_m +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VSEV_FLOAT_M __riscv_vse64_v_f64m2_m +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#endif + + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +// Optimizes the implementation in ../arm64/trsm_utcopy_sve.c + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + FLOAT_V_T va1; + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + va1 = VLEV_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSEV_FLOAT_M(vbool_cmp, b, va1, vl); + *(b + j) = INV(*(ao + j)); + + ao += lda; + b += vl; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + va1 = VLEV_FLOAT(ao, vl); + VSEV_FLOAT(b, va1, vl); + } + ao += lda; + b += vl; + i ++; + ii ++; + } + } + + a += vl; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c new file mode 100644 index 000000000..180cf059a --- /dev/null +++ b/kernel/riscv64/zamax_rvv.c @@ -0,0 +1,124 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + FLOAT_V_T v0, v1, vmax; + FLOAT_V_T_M1 v_res; + FLOAT_VX2_T vx2; + + v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmax = VFMVVF_FLOAT(0.0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, v0, vl); + + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmax = VFMAXVV_FLOAT_TU(vmax, vmax, 
v0, vl); + } + + } + + v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax); + maxf = VFMVFS_FLOAT_M1(v_res); + + return(maxf); +} diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index bfb282ae0..ec4a5a1e9 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -28,39 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMAXVV_FLOAT vfmax_vv_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 - +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMAXVV_FLOAT vfmax_vv_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMAXVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) +#else +#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) #endif +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMAXVV_FLOAT JOIN(RISCV_RVV(vfmax), _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VFABSV_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -70,10 +78,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; @@ -83,10 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for(; i maxf) - maxf = VFMVFS_FLOAT(v_res); + v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl); } + maxf = EXTRACT_FLOAT(v_res); return(maxf); } 
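Note on the zamax kernels above: they reduce the BLAS-style complex magnitude |Re| + |Im| (not the Euclidean modulus), first accumulating per-lane maxima and then doing a single vfredmax at the end. A minimal scalar sketch of the same semantics, for reference only; the function name and types are illustrative and not part of the patch:

    #include <math.h>

    /* Reference semantics for zamax_rvv.c / zamax_vector.c:
       max over i of |Re(x_i)| + |Im(x_i)|, returning 0 when
       n <= 0 or inc_x <= 0, exactly as the vector kernels do. */
    static double zamax_ref(long n, const double *x, long inc_x)
    {
        double maxf = 0.0;
        if (n <= 0 || inc_x <= 0) return maxf;
        for (long i = 0; i < n; i++) {
            double a = fabs(x[2 * i * inc_x]) + fabs(x[2 * i * inc_x + 1]);
            if (a > maxf) maxf = a;
        }
        return maxf;
    }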
diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c new file mode 100644 index 000000000..56a467502 --- /dev/null +++ b/kernel/riscv64/zamin_rvv.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f32m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMINVV_FLOAT_TU __riscv_vfmin_vv_f64m4_tu +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + FLOAT_V_T v0, v1, vmin; + FLOAT_V_T_M1 
v_res; + FLOAT_VX2_T vx2; + + v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1); + size_t vlmax = VSETVL_MAX; + vmin = VFMVVF_FLOAT(FLT_MAX, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v0 = VFADDVV_FLOAT(v0, v1, vl); + vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl); + } + + } + + v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax); + minf = VFMVFS_FLOAT_M1(v_res); + + return(minf); +} diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index d9eca7f10..45b3e0b9d 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -29,38 +29,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFMINVV_FLOAT vfmin_vv_f32m8 -#define VFADDVV_FLOAT vfadd_vv_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 32 +# else +# define ELEN 32 +# define MLEN 16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFMINVV_FLOAT vfmin_vv_f64m8 -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 8 +# else +# define ELEN 32 +# define MLEN 4 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDMINVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) (v_res, va, vb, gvl) +#else +#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define MASK_T JOIN(vbool, MLEN, _t, _, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFMINVV_FLOAT 
JOIN(RISCV_RVV(vfmin), _vv_f, ELEN, LMUL, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VFABSV_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -69,10 +80,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; - FLOAT_V_T_M1 v_res, v_max; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; @@ -82,10 +91,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) for(; i 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + v0 = VLEV_FLOAT(x, vl); + v1 = VLEV_FLOAT(x+vl, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); + } + + } + else { + + int stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + v0 = VLSEV_FLOAT(x, stride_x, vl); + v1 = VLSEV_FLOAT(x+1, stride_x, vl); + + v0 = VFABSV_FLOAT(v0, vl); + v1 = VFABSV_FLOAT(v1, vl); + + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); + } + + } + + FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax); + asumf += VFMVFS_FLOAT_M1(v_res); + + return(asumf); +} diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c index 0d1cc42f1..fca904d6a 100644 --- a/kernel/riscv64/zasum_vector.c +++ b/kernel/riscv64/zasum_vector.c @@ -28,37 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1 -#define MASK_T vbool4_t -#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 -#define VFMVVF_FLOAT vfmv_v_f_f32m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m -#define VFADDVV_FLOAT vfadd_vv_f32m8 +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b32 +# else +# define ELEN 32 +# define MLEN _b16 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1 -#define MASK_T vbool8_t -#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 -#define VFMVVF_FLOAT vfmv_v_f_f64m8 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m -#define VFADDVV_FLOAT vfadd_vv_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b8 +# else +# define ELEN 32 +# define MLEN _b4 +# endif #endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl) +#else +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#endif +#define VFABS_FLOAT JOIN(RISCV_RVV(vfabs), _v_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt), _vf_f, ELEN, LMUL, MLEN) + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0, j=0; @@ -67,12 +77,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); - MASK_T mask0, mask1; if(inc_x == 1){ BLASLONG n2 = n * 2; gvl = VSETVL(n2); @@ -81,26 +88,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0; n -= vl, y += vl*inc_y2) + { + vl = VSETVL(n); + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + } + } + else + { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) + { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + + vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl); + vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl); + + vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl); + vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl); + + vyx2 = 
VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + } + } + else + { + FLOAT_V_T v0, v1; + FLOAT_VX2_T v_x2; + + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) + { + vl = VSETVL(n); + + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + v0 = VFMULVF_FLOAT(vy1, beta_i, vl); + v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl); + + v1 = VFMULVF_FLOAT(vy1, beta_r, vl); + v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); + + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + VSSSEG_FLOAT(y, stride_y, v_x2, vl); + } + } + else + { + for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) + { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + v0 = VFMULVF_FLOAT(vx0, alpha_r, vl); + v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl); + v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, vl); + v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, vl); + + v1 = VFMULVF_FLOAT(vx1, alpha_r, vl); + v1 = VFMACCVF_FLOAT(v1, alpha_i, vx0, vl); + v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl); + v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl); + + v_x2 = VSET_VX2(v_x2, 0, v0); + v_x2 = VSET_VX2(v_x2, 1, v1); + + VSSSEG_FLOAT(y, stride_y, v_x2, vl); + } + } + } + return(0); + +} diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c index 5e6034ac5..bbf2bbe7d 100644 --- a/kernel/riscv64/zaxpby_vector.c +++ b/kernel/riscv64/zaxpby_vector.c @@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFMSACVF_FLOAT vfmsac_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFMSACVF_FLOAT vfmsac_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y) @@ -62,6 +62,82 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL stride_x = inc_x * 2 * sizeof(FLOAT); stride_y = inc_y * 2 * sizeof(FLOAT); + if (inc_x == 0 || 
inc_y == 0) { + + FLOAT temp; + BLASLONG inc_x2, inc_y2; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + if ( beta_r == 0.0 && beta_i == 0.0) + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + y[iy] = 0.0 ; + y[iy+1] = 0.0 ; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + y[iy] = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + } + else + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + temp = ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + temp = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) + ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) + ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + + + } + return(0); + + } else { + if(beta_r == 0.0 && beta_i == 0.0){ if(alpha_r == 0.0 && alpha_i == 0.0){ if(inc_y == 1){ @@ -191,5 +267,6 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL } } return(0); + } } diff --git a/kernel/riscv64/zaxpy.c b/kernel/riscv64/zaxpy.c index 1dcaeac27..18b6315cb 100644 --- a/kernel/riscv64/zaxpy.c +++ b/kernel/riscv64/zaxpy.c @@ -44,7 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG inc_x2; BLASLONG inc_y2; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); if ( da_r == 0.0 && da_i == 0.0 ) return(0); ix = 0; diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c new file mode 100644 index 000000000..0db32df10 --- /dev/null +++ b/kernel/riscv64/zaxpy_rvv.c @@ -0,0 +1,190 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if(n < 0) return(0); + if(da_r == 0.0 && da_i == 0.0) return(0); + + FLOAT_V_T vx0, vx1, vy0, vy1; + FLOAT_VX2_T vxx2, vyx2; + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); + } + + } else if (inc_x == 1) { + + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + + } else if (inc_y == 1) { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, 
da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); + } + + } else { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + #if !defined(CONJ) + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #else + vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl); + vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl); + vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl); + #endif + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c index 4ccfe4a81..1e766c5f4 100644 --- a/kernel/riscv64/zaxpy_vector.c +++ b/kernel/riscv64/zaxpy_vector.c @@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - BLASLONG ix = 0,iy = 0; - if(n < 0) return(0); - if(da_r == 0.0 && da_i == 0.0) return(0); + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + if(n <= 0) return(0); + if(da_r == 0.0 && da_i == 0.0) return(0); unsigned int gvl = 0; BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); diff --git a/kernel/riscv64/zcopy.c b/kernel/riscv64/zcopy.c index 07fe584c5..b0f19efd5 100644 --- a/kernel/riscv64/zcopy.c +++ b/kernel/riscv64/zcopy.c @@ -43,7 +43,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG inc_x2; BLASLONG inc_y2; - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); inc_x2 = 2 * inc_x; inc_y2 = 2 * inc_y; diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c new file mode 100644 index 000000000..13879f03b --- /dev/null +++ 
b/kernel/riscv64/zcopy_rvv.c @@ -0,0 +1,105 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL_M8(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T_M8 vfloat32m8_t +#define VLEV_FLOAT_M8 __riscv_vle32_v_f32m8 +#define VSEV_FLOAT_M8 __riscv_vse32_v_f32m8 + +#define VSETVL_M4(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_VX2_T_M4 vfloat32m4x2_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e32_v_f32m4x2 +#else +#define VSETVL_M8(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T_M8 vfloat64m8_t +#define VLEV_FLOAT_M8 __riscv_vle64_v_f64m8 +#define VSEV_FLOAT_M8 __riscv_vse64_v_f64m8 + +#define VSETVL_M4(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_VX2_T_M4 vfloat64m4x2_t +#define VLSEG_FLOAT_M4 __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT_M4 __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT_M4 __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT_M4 __riscv_vssseg2e64_v_f64m4x2 +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + if(n <= 0) return(0); + + if(inc_x == 1 && inc_y == 1) { + + FLOAT_V_T_M8 vx; + n *= 2; // convert to words + + for(size_t vl; n > 0; n -= vl, x += vl, y += vl) { + vl = VSETVL_M8(n); + vx = VLEV_FLOAT_M8(x, vl); + VSEV_FLOAT_M8(y, vx, vl); + } + + }else if (1 == inc_x) { + + FLOAT_VX2_T_M4 vx2; + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL_M4(n); + vx2 = VLSEG_FLOAT_M4(x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vx2, vl); + } + } else if (1 == inc_y) { + + FLOAT_VX2_T_M4 vx2; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL_M4(n); + vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl); + VSSEG_FLOAT_M4(y, 
vx2, vl); + } + } else { + + FLOAT_VX2_T_M4 vx2; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL_M4(n); + vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl); + VSSSEG_FLOAT_M4(y, stride_y, vx2, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c index 55a480a35..9e4a67b71 100644 --- a/kernel/riscv64/zcopy_vector.c +++ b/kernel/riscv64/zcopy_vector.c @@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) #endif diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c new file mode 100644 index 000000000..13bc2ee39 --- /dev/null +++ b/kernel/riscv64/zdot_rvv.c @@ -0,0 +1,194 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4 +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + OPENBLAS_COMPLEX_FLOAT result; + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + + if ( n <= 0 ) return(result); + + FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1; + FLOAT_V_T_M1 v_res, v_z0; + FLOAT_VX2_T vxx2, vyx2; + size_t vlmax_m1 = VSETVL_MAX_M1; + v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1); + + size_t vlmax = VSETVL_MAX; + vr0 = VFMVVF_FLOAT(0, vlmax); + vr1 = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); + #endif + } + + } else if (inc_x == 1){ + + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); + #endif + } + } else if (inc_y == 1){ + + BLASLONG stride_x = inc_x * 2 * 
sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); + #endif + } + }else { + + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl); + #if !defined(CONJ) + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl); + #else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl); + #endif + } + } + + v_res = VFREDSUM_FLOAT(vr0, v_z0, vlmax); + CREAL(result) = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, vlmax); + CIMAG(result) = VFMVFS_FLOAT_M1(v_res); + + return(result); +} diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c index 0900206b3..13b8fe378 100644 --- a/kernel/riscv64/zdot_vector.c +++ b/kernel/riscv64/zdot_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFDOTVV_FLOAT vfdot_vv_f32m4 -#define VFMULVV_FLOAT vfmul_vv_f32m4 -#define VFMSACVV_FLOAT vfmsac_vv_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFDOTVV_FLOAT vfdot_vv_f64m4 -#define VFMULVV_FLOAT vfmul_vv_f64m4 -#define VFMSACVV_FLOAT vfmsac_vv_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) @@ -109,9 +117,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA ix += inc_xv; iy += inc_yv; } - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); dot[0] += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); dot[1] += VFMVFS_FLOAT(v_res); //tail if(j < n){ @@ -132,9 +140,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA vr1 = VFMULVV_FLOAT(vx1, vy0, gvl); vr1 = VFMSACVV_FLOAT(vr1, vx0, vy1, gvl); #endif - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); dot[0] += 
VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); dot[1] += VFMVFS_FLOAT(v_res); } CREAL(result) = dot[0]; diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c new file mode 100644 index 000000000..ee334801b --- /dev/null +++ b/kernel/riscv64/zgemm_beta_rvv.c @@ -0,0 +1,130 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m4 +#define VFSUBVV_FLOAT __riscv_vfsub_vv_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m4 +#define VFSUBVV_FLOAT __riscv_vfsub_vv_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, + FLOAT beta_r, FLOAT beta_i, + FLOAT *dummy2, BLASLONG dummy3, + FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc) +{ + BLASLONG chunk; + FLOAT *c_offset; + size_t vl; + FLOAT_V_T vr, vi, v1, v2, v3, v4; + FLOAT_VX2_T vx2; + + ldc *= 2; + c_offset = c; + + if (beta_r == 0.0 && beta_i == 0.0) { + + vl = VSETVL(m); + vr = VFMVVF_FLOAT(0.0, vl); + vi = VFMVVF_FLOAT(0.0, vl); + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { + vl = VSETVL(chunk); + + VSSEG_FLOAT(c_offset, vx2, vl); + } + } + + } else { + + for( ; n > 0; n--, c += ldc) { + c_offset = c; + + for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) { + vl = VSETVL(chunk); + + vx2 = VLSEG_FLOAT(c_offset, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); + + v1 = VFMULVF_FLOAT(vr, beta_r, vl); + v2 = VFMULVF_FLOAT(vi, beta_i, vl); + + v3 = VFMULVF_FLOAT(vi, beta_r, vl); + v4 = VFMULVF_FLOAT(vr, beta_i, vl); + + vr = VFSUBVV_FLOAT(v1, v2, vl); + vi = VFADDVV_FLOAT(v3, v4, vl); + + vx2 = VSET_VX2(vx2, 0, vr); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(c_offset, vx2, vl); + } + } + + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c new file mode 100644 index 000000000..0776f03fd --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c @@ -0,0 +1,720 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=4 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f64m2' + VFMUL='__riscv_vfmul_vf_f64m2' + VLEV='__riscv_vle64_v_f64m2' + VLSEV='__riscv_vlse64_v_f64m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' + VSETVL='__riscv_vsetvl_e64m2' + VSEV='__riscv_vse64_v_f64m2' + VSSEV='__riscv_vsse64_v_f64m2' + acc_vector_t='vfloat64m2_t' + output='zgemm_kernel_4x4_zvl128b.c' + param_scalar_t='double' + param_vector_t='vfloat64m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 
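+// S0..S3 are the signs used by the scalar tail loops below:
+// real += S0*Ar*Br + S1*Ai*Bi,  imag += S2*Ai*Br + S3*Ar*Bi.
+// VFMACC_RR / VFMACC_RI select the fused multiply variants that apply the
+// same signs in the vector path of each conjugation case.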
+#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m2(4); + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + double B2r = B[bi + 2 * 2 + 0]; + double B2i = B[bi + 2 * 2 + 1]; + double B3r = B[bi + 3 * 2 + 0]; + double B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat64m2_t ACC2r = tmp0r; + vfloat64m2_t ACC2i = tmp0i; + vfloat64m2_t ACC3r = tmp1r; + vfloat64m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, 
gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C2r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C2i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C3r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C3i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl); + C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl); + C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl); + C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + // -- tails for main pass + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + double result8 = 0; + double result9 = 0; + double result10 = 0; + double result11 = 0; + double result12 = 0; + double result13 = 0; + double result14 = 0; + double result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + 
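+      // M%4 == 2 remainder: the 2x4 block of C is accumulated in scalars; each
+      // pair result0/result1, result2/result3, ... holds the real/imaginary
+      // part of one C entry, using the S0..S3 signs of the active conjugation
+      // case. alpha is applied once, after the k loop, when C is updated.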
+ for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result8 * alphar; + Ci += result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 1) * 2 + 1]; + Cr += result10 * alphar; + Ci += result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result12 * alphar; + Ci += result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 1) * 2 + 1]; + Cr += result14 * alphar; + Ci += result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) 
* 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 2 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 2 * ldc + 0) * 2 + 1]; + Cr += result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 3 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 3 * ldc + 0) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; 
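+        // Per k step, the two remaining B columns are read as scalar re/im
+        // values, while A is reloaded as two strided vectors so that the real
+        // and imaginary elements of the packed A panel land in separate
+        // registers (A0r / A0i).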
+ + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + ci += ldc - gvl * 0; + vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl); + C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + + ci = n_top * ldc + m_top; + + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += 
result4 * alphar; + Ci += result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 1) * 2 + 1]; + Cr += result6 * alphar; + Ci += result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 1 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 1 * ldc + 0) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl); + + C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl); + C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + + ci = n_top * ldc + m_top; + + 
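+      // C has been updated in registers as C += alpha * ACC (a complex scaling
+      // of the accumulators), and is now stored back with the same element
+      // stride of 2*sizeof(FLOAT) so the real/imaginary lanes return to their
+      // interleaved layout in memory.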
__riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = C[(ci + 0 * ldc + 1) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 1) * 2 + 1]; + Cr += result2 * alphar; + Ci += result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + + for (BLASLONG k = 0; k < K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = C[(ci + 0 * ldc + 0) * 2 + 0]; + Ci = C[(ci + 0 * ldc + 0) * 2 + 1]; + Cr += result0 * alphar; + Ci += result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c b/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c new file mode 100644 index 000000000..ca33368f0 --- /dev/null +++ b/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c @@ -0,0 +1,1253 @@ +/* + +AUTOGENERATED KERNEL +Settings: + LMUL=1 + M=8 + M_tail_scalar_from=1 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl256b' + force_acc_double=False + index_type='BLASLONG' + op='gemm' + param_precision='double' + reg_width_bits=256 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=1 + VFMACC='__riscv_vfmacc_vf_f64m1' + VFMUL='__riscv_vfmul_vf_f64m1' + VLEV='__riscv_vle64_v_f64m1' + VLSEV='__riscv_vlse64_v_f64m1' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m1' + VSETVL='__riscv_vsetvl_e64m1' + VSEV='__riscv_vse64_v_f64m1' + VSSEV='__riscv_vsse64_v_f64m1' + acc_vector_t='vfloat64m1_t' + output='zgemm_kernel_8x4_zvl256b.c' + param_scalar_t='double' + param_vector_t='vfloat64m1_t' + +*/ + +#include "common.h" + + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || 
defined(CN) || defined(CT) + #define S0 1 + #define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfnmacc +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + + // -- MAIN PASS + + for (BLASLONG j=0; j N ) + n_packing >>= 1; + + BLASLONG m_packing = UNROLL_M; + BLASLONG m_top = 0; + while (m_top < M) + { + while( m_top+m_packing > M ) + m_packing >>= 1; + + BLASLONG ai = K*m_top*2; + BLASLONG bi = K*n_top*2; + + BLASLONG pass_K = K; + + + #ifdef TRMMKERNEL + #ifdef LEFT + BLASLONG off = offset + m_top; + #else + BLASLONG off = -offset + n_top; + #endif + #ifdef BACKWARDS + ai += off * m_packing*2; + bi += off * n_packing*2; + pass_K -= off; + #else + #ifdef LEFT + pass_K = off + m_packing; + #else + pass_K = off + n_packing; + #endif + #endif + #endif + + memset( res, 0, UNROLL_M*UNROLL_N*2*sizeof(FLOAT) ); + + for (BLASLONG k=0; k> 2); j > 0; j--) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + v2x2 = VLSEG2_FLOAT(aoffset2, vl); + v3x2 = VLSEG2_FLOAT(aoffset3, vl); + v4x2 = VLSEG2_FLOAT(aoffset4, vl); + + vxx8 = VSET_VX8(vxx8, 0, VGET_VX2(v1x2, 0)); + vxx8 = VSET_VX8(vxx8, 1, VGET_VX2(v1x2, 1)); + vxx8 = VSET_VX8(vxx8, 2, VGET_VX2(v2x2, 0)); + vxx8 = VSET_VX8(vxx8, 3, VGET_VX2(v2x2, 1)); + vxx8 = VSET_VX8(vxx8, 4, VGET_VX2(v3x2, 0)); + vxx8 = VSET_VX8(vxx8, 5, VGET_VX2(v3x2, 1)); + vxx8 = VSET_VX8(vxx8, 6, VGET_VX2(v4x2, 0)); + vxx8 = VSET_VX8(vxx8, 7, VGET_VX2(v4x2, 1)); + + VSSEG8_FLOAT(boffset, vxx8, vl); + + aoffset1 += vl * 2; + aoffset2 += vl * 2; + aoffset3 += vl * 2; + aoffset4 += vl * 2; + boffset += vl * 8; + } + } + + if (n & 2) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + v2x2 = VLSEG2_FLOAT(aoffset2, vl); + + vxx4 = VSET_VX4(vxx4, 0, VGET_VX2(v1x2, 0)); + vxx4 = VSET_VX4(vxx4, 1, VGET_VX2(v1x2, 1)); + vxx4 = VSET_VX4(vxx4, 2, VGET_VX2(v2x2, 0)); + vxx4 = VSET_VX4(vxx4, 3, VGET_VX2(v2x2, 1)); + + VSSEG4_FLOAT(boffset, vxx4, vl); + + aoffset1 += vl * 2; + aoffset2 += vl * 2; + boffset += vl * 4; + } + } + + if (n & 1) { + aoffset1 = aoffset; + aoffset += lda; + + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + v1x2 = VLSEG2_FLOAT(aoffset1, vl); + + VSSEG2_FLOAT(boffset, v1x2, vl); + + aoffset1 += vl * 2; + boffset += vl * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c new file mode 100644 index 000000000..275daa5f2 --- /dev/null +++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *a_offset1; + FLOAT *b_offset; + + FLOAT_VX2_T vx2; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + a_offset = a; + b_offset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + a_offset1 = a_offset; + a_offset += vl * lda * 2; + + for(i = m; i > 0; i--) { + vx2 = VLSSEG2_FLOAT(a_offset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(b_offset, vx2, vl); + + a_offset1 += 2; + b_offset += vl * 2; + } + } + return 0; +} + diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c new file mode 100644 index 000000000..cfafbf0dc --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c @@ -0,0 +1,191 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m1(n) +#define FLOAT_V_T vfloat32m1_t +#define FLOAT_VX2_T vfloat32m1x2_t +#define FLOAT_VX4_T vfloat32m1x4_t +#define FLOAT_VX8_T vfloat32m1x8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m1(n) +#define FLOAT_V_T vfloat64m1_t +#define FLOAT_VX2_T vfloat64m1x2_t +#define FLOAT_VX4_T vfloat64m1x4_t +#define FLOAT_VX8_T vfloat64m1x8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m1 +#define VSEV_FLOAT __riscv_vse64_v_f64m1 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 +#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 +#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 +#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 +#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3; + + FLOAT_V_T v0; + FLOAT_VX2_T vx2; + FLOAT_VX4_T vx4; + FLOAT_VX8_T vx8; + + size_t vl; + + //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + boffset2 = b + 2 * m * (n & ~3); + boffset3 = b + 2 * m * (n & ~1); + + for(j = (m >> 2); j > 0; j--) { + + aoffset1 = aoffset; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 32; + + for(i = (n >> 2); i > 0; i--) { + vl = 4; + + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, vx8, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 2) { + vl = 4; + + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, vx4, vl); + + aoffset1 += 4; + boffset2 += 16; + } + + if (n & 1) { + vl = 4; + + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, vx2, vl); + + aoffset1 += 2; + boffset3 += 8; + } + } + + if (m & 2) { + aoffset1 = aoffset; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 16; + + for(i = (n >> 2); i > 0; i--) { + vl = 2; + + vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG8_FLOAT(boffset1, vx8, vl); + + aoffset1 += 8; + boffset1 += m * 8; + } + + if (n & 2) { + vl = 2; + + vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG4_FLOAT(boffset2, vx4, vl); + + aoffset1 += 4; + boffset2 += 8; + } + + if (n & 1) { + vl = 2; + + vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); + VSSEG2_FLOAT(boffset3, vx2, vl); + + //aoffset1 += 2; + boffset3 += 4; + } + } + + if (m & 
1) { + aoffset1 = aoffset; + boffset1 = boffset; + + for(i = (n >> 2); i > 0; i--) { + vl = 8; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset1, v0, vl); + + aoffset1 += 8; + boffset1 += 8 * m; + } + + if (n & 2) { + vl = 4; + + v0 = VLEV_FLOAT(aoffset1, vl); + VSEV_FLOAT(boffset2, v0, vl); + + aoffset1 += 4; + //boffset2 += 4; + } + + if (n & 1) { + *(boffset3) = *(aoffset1); + *(boffset3 + 1) = *(aoffset1 + 1); + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c new file mode 100644 index 000000000..96e986502 --- /dev/null +++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) +{ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1; + IFLOAT *boffset; + + FLOAT_VX2_T vx2; + size_t vl; + + //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda); + + aoffset = a; + boffset = b; + + for(j = n; j > 0; j -= vl) { + vl = VSETVL(j); + + aoffset1 = aoffset; + aoffset += vl * 2; + + for(i = m; i > 0; i--) { + vx2 = VLSEG2_FLOAT(aoffset1, vl); + VSSEG2_FLOAT(boffset, vx2, vl); + + aoffset1 += lda * 2; + boffset += vl * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c new file mode 100644 index 000000000..77e012ff5 --- /dev/null +++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c @@ -0,0 +1,553 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#endif + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + + //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc); // Debug + + size_t vl; + for (j = bn/4; j > 0; j--) + { + C0 = C; + C1 = C0 + 2 * ldc; + C2 = C1 + 2 * ldc; + C3 = C2 + 2 * ldc; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), 
va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va2, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va3, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va3, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va2, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va2, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va3, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va3, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va2, vl); + + ptrbb += 8; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va4, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va5, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va5, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va4, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va4, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va5, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va5, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va4, vl); + ptrbb += 8; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va6, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va7, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va7, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va6, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va6, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va7, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va7, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va6, vl); + + ptrbb += 8; + } + + for (k = (bk & 3); k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb 
+ 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + } + + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C1, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + + vax2 = VLSEG2_FLOAT(C2, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C3, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres4, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres5, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C2, vax2, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres6, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres7, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C3, vax2, vl); + + C0 += vl * 2; + C1 += vl * 2; + C2 += vl * 2; + C3 += vl * 2; + } + + bb += (bk << 3); + C += (ldc << 3); + } + + if (bn & 2) + { + C0 = C; + C1 = C0 + 2 * ldc; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + ptrbb += 4; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 
0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + ptrbb += 4; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + ptrbb += 4; + } + + for (k = (bk & 3); k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + } + + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vax2 = VLSEG2_FLOAT(C1, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + + va2 = VFMACCVF_FLOAT(va2, alphar, vres2, vl); + va3 = VFMACCVF_FLOAT(va3, alphar, vres3, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + + C0 += vl * 2; + C1 += vl * 2; + } + + bb += (bk << 2); + C += (ldc << 2); + } + + if (bn & 1) + { + C0 = C; + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); + ptrbb = bb; + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + + for (k = bk/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + ptrbb += 2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + ptrbb += 2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + ptrbb += 
2; + } + + for (k = (bk & 3); k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + ptrbb += 2; + } + + vax2 = VLSEG2_FLOAT(C0, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + va0 = VFMACCVF_FLOAT(va0, alphar, vres0, vl); + va1 = VFMACCVF_FLOAT(va1, alphar, vres1, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + C0 += vl * 2; + } + + bb += bk << 1; + C += ldc << 1; + } + return 0; +} + diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c new file mode 100644 index 000000000..f14ef5ba8 --- /dev/null +++ b/kernel/riscv64/zgemv_n_rvv.c @@ -0,0 +1,192 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix; + FLOAT *a_ptr; + FLOAT temp_r, temp_i; + FLOAT_V_T va0, va1, vy0, vy1; + FLOAT_VX2_T vax2, vyx2; + + BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; + + BLASLONG inc_x2 = inc_x * 2; + BLASLONG lda2 = lda * 2; + if (inc_y == 1) + { + for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*2) { + vl = VSETVL(m); + a_ptr = a; + ix = 0; + vyx2 = VLSEG_FLOAT(y, vl); + + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + for(i = 0; i < n; i++){ +#if !defined(XCONJ) + temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; +#else + temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; +#endif + + vax2 = VLSEG_FLOAT(a_ptr, vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); +#if !defined(CONJ) +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#else +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#endif + a_ptr += lda2; + ix += inc_x2; + } + + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSEG_FLOAT(y, vyx2, vl); + } + + } + else + { + for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*inc_y*2) { + vl = VSETVL(m); + a_ptr 
= a; + ix = 0; + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + for(i = 0; i < n; i++){ +#if !defined(XCONJ) + temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; +#else + temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; + temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; +#endif + + vax2 = VLSEG_FLOAT(a_ptr, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); +#if !defined(CONJ) +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#else +#if !defined(XCONJ) + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl); +#endif +#endif + a_ptr += lda2; + ix += inc_x2; + } + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + } + return(0); +} diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index 3095c28f9..104d3865d 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -27,23 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c new file mode 100644 index 000000000..1c89a9f72 --- /dev/null +++ b/kernel/riscv64/zgemv_t_rvv.c @@ -0,0 +1,187 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDSUM_FLOAT_TU __riscv_vfredusum_vs_f32m4_f32m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDSUM_FLOAT_TU __riscv_vfredusum_vs_f64m4_f64m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0, iy = 0; + FLOAT *a_ptr = a; + FLOAT temp_r, temp_i; + + FLOAT_V_T va0, va1, vx0, vx1, vr, vi; + FLOAT_V_T_M1 v_res, v_z0; + FLOAT_VX2_T vxx2, vax2; + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + //BLASLONG stride_a = sizeof(FLOAT) * 2; + BLASLONG inc_y2 = inc_y * 2; + BLASLONG lda2 = lda * 2; + + size_t vlmax = 
VSETVL_MAX_M1; + v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_z0 = VFMVVF_FLOAT_M1(0, vlmax); + vlmax = VSETVL(m); + + if (inc_x == 1) + { + for(i = 0; i < n; i++) { + j = 0; + ix = 0; + vr = VFMVVF_FLOAT(0, vlmax); + vi = VFMVVF_FLOAT(0, vlmax); + for(size_t vl, k = m; k > 0; k -= vl) { + vl = VSETVL(k); + + vax2 = VLSEG_FLOAT(&a_ptr[j], vl); + vxx2 = VLSEG_FLOAT(&x[ix], vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl); +#else + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl); +#endif + j += vl * 2; + ix += vl * inc_x * 2; + } + + v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax); + temp_r = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax); + temp_i = VFMVFS_FLOAT_M1(v_res); + +#if !defined(XCONJ) + y[iy] += alpha_r * temp_r - alpha_i * temp_i; + y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y[iy] += alpha_r * temp_r + alpha_i * temp_i; + y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + iy += inc_y2; + a_ptr += lda2; + } + } + else + { + for(i = 0; i < n; i++) { + j = 0; + ix = 0; + vr = VFMVVF_FLOAT(0, vlmax); + vi = VFMVVF_FLOAT(0, vlmax); + for(size_t vl, k = m; k > 0; k -= vl) { + vl = VSETVL(k); + + vax2 = VLSEG_FLOAT(&a_ptr[j], vl); + vxx2 = VLSSEG_FLOAT(&x[ix], stride_x, vl); + + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl); +#else + vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl); + vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl); + vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl); + vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl); +#endif + j += vl * 2; + ix += vl * inc_x * 2; + } + + v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax); + temp_r = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax); + temp_i = VFMVFS_FLOAT_M1(v_res); + +#if !defined(XCONJ) + y[iy] += alpha_r * temp_r - alpha_i * temp_i; + y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y[iy] += alpha_r * temp_r + alpha_i * temp_i; + y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + iy += inc_y2; + a_ptr += lda2; + } + + } + + + return(0); +} diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c index a7a8a5279..5d85ab3a4 100644 --- a/kernel/riscv64/zgemv_t_vector.c +++ b/kernel/riscv64/zgemv_t_vector.c @@ -27,31 +27,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t +#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() +#define FLOAT_V_T vfloat32m2_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(vr, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(va, vb, gvl) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() +#define FLOAT_V_T vfloat64m2_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(vr, va, vb, gvl) +#else +#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(va, vb, gvl) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m2) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -62,49 +70,43 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT temp_r, temp_i; FLOAT_V_T va0, va1, vx0, vx1, vr, vi; - unsigned int gvl = 0; - FLOAT_V_T_M1 v_res, v_z0; - gvl = VSETVL_MAX; - v_res = VFMVVF_FLOAT_M1(0, gvl); - v_z0 = VFMVVF_FLOAT_M1(0, gvl); - + unsigned int gvl = VSETVL(m); + FLOAT_V_T_M1 v_res_r, v_res_i; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; BLASLONG stride_a = sizeof(FLOAT) * 2; - gvl = VSETVL(m); BLASLONG inc_xv = inc_x * gvl * 2; BLASLONG inc_av = gvl * 2; BLASLONG inc_y2 = inc_y * 2; BLASLONG lda2 = lda * 2; + for(i = 0; i < n; i++){ + v_res_r = VFMVVF_FLOAT_M1(0, 1); + v_res_i = VFMVVF_FLOAT_M1(0, 1); gvl = VSETVL(m); j = 0; ix = 0; - vr = VFMVVF_FLOAT(0, gvl); - vi = VFMVVF_FLOAT(0, gvl); for(k = 0; k < m/gvl; k++){ va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl); vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); #if ( 
!defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl); + vr = VFMULVV_FLOAT(va0, vx0, gvl); + vi = VFMULVV_FLOAT(va0, vx1, gvl); vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl); vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl); #else - vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl); + vr = VFMULVV_FLOAT(va0, vx0, gvl); + vi = VFMULVV_FLOAT(va0, vx1, gvl); vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl); - vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl); vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl); - #endif + v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl); + v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl); + j += inc_av; ix += inc_xv; } - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp_r = VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl); - temp_i = VFMVFS_FLOAT(v_res); if(j/2 < m){ gvl = VSETVL(m-j/2); va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); @@ -113,21 +115,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) vr = VFMULVV_FLOAT(va0, vx0, gvl); - vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl); vi = VFMULVV_FLOAT(va0, vx1, gvl); + vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl); vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl); #else vr = VFMULVV_FLOAT(va0, vx0, gvl); - vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl); vi = VFMULVV_FLOAT(va0, vx1, gvl); + vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl); vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl); #endif - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - temp_r += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl); - temp_i += VFMVFS_FLOAT(v_res); + v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl); + v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl); } + + temp_r = VFMVFS_FLOAT(v_res_r); + temp_i = VFMVFS_FLOAT(v_res_i); + #if !defined(XCONJ) y[iy] += alpha_r * temp_r - alpha_i * temp_i; y[iy+1] += alpha_r * temp_i + alpha_i * temp_r; diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..97013895a --- /dev/null +++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c @@ -0,0 +1,139 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT __riscv_vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT __riscv_vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + //fprintf(stderr, "%s, %s, m=%ld n=%ld lda=%ld posX=%ld posY=%ld\n", __FUNCTION__, __FILE__, m, n, lda, posX, posY); + + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + + FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + FLOAT_VX2_T va1x2, va2x2, vbx2; + VBOOL_T vbool_gt0, vbool_lt0, vbool_eq0; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + vzero = VFMVVF_FLOAT(ZERO, vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX * 2 + posY * lda * 2; + ao2 = a + posY * 2 + posX * lda * 2; + + for (i = m; i > 0; i--, offset--) { + va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl); + va1x2 = VLSEG2_FLOAT(ao1, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 
= VGET_VX2(va1x2, 1); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); + vbool_lt0 = VMSLT_VX_INT(vindex, 0, vl); + vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool_gt0, vl); + + vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); + + vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_lt0, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); + + b += vl * 2; + ao1 += lda * 2; + ao2 += 2; + } + } + + return 0; +} + diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c new file mode 100644 index 000000000..59029e9e5 --- /dev/null +++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VMSLT_VX_INT __riscv_vmslt_vx_i32m2_b16 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VMSLT_VX_INT __riscv_vmslt_vx_i64m2_b32 +#define VMSEQ_VX_INT __riscv_vmseq_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + //fprintf(stderr, "%s, %s, m=%ld n=%ld lda=%ld posX=%ld posY=%ld\n", __FUNCTION__, __FILE__, m, n, lda, posX, posY); + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + + FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero; + FLOAT_VX2_T va1x2, va2x2, vbx2; + VBOOL_T vbool_gt0, vbool_eq0; + INT_V_T vindex_max, vindex; + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + vzero = VFMVVF_FLOAT(ZERO, vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY * 2 + posX * lda * 2; + ao2 = a + posX * 2 + posY * lda * 2; + + for (i = m; i > 0; i--, offset--) { + va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl); + va2x2 = VLSEG2_FLOAT(ao2, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool_gt0 = VMSGT_VX_INT(vindex, 0, vl); + vbool_eq0 = VMSEQ_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool_gt0, vl); + + vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl); + + vb1 = VMERGE_VVM_FLOAT(vb1, vb2, vbool_gt0, vl); + vb1 = VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); + + b += vl * 2; + ao1 
+= 2; + ao2 += lda * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zhemv_LM_rvv.c b/kernel/riscv64/zhemv_LM_rvv.c new file mode 100644 index 000000000..95c6a377c --- /dev/null +++ b/kernel/riscv64/zhemv_LM_rvv.c @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu
+#else
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu
+#endif
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
+    BLASLONG i, j, k;
+    BLASLONG ix, iy, ia;
+    BLASLONG jx, jy, ja;
+    FLOAT temp_r1, temp_i1;
+    FLOAT temp_r2, temp_i2;
+    FLOAT *a_ptr = a;
+    unsigned int gvl = 0;
+    FLOAT_V_T_M1 v_res, v_z0;
+    gvl = VSETVL_MAX;
+    v_res = VFMVVF_FLOAT_M1(0, gvl);
+    v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+
+    FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1;
+    BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, len, lda2;
+
+    BLASLONG inc_x2 = incx * 2;
+    BLASLONG inc_y2 = incy * 2;
+    stride_x = inc_x2 * sizeof(FLOAT);
+    stride_y = inc_y2 * sizeof(FLOAT);
+    stride_a = 2 * sizeof(FLOAT);
+    lda2 = lda * 2;
+
+    jx = 0;
+    jy = 0;
+    ja = 0;
+    for(j = 0; j < offset; j++){
+        temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];
+        temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx];
+        temp_r2 = 0;
+        temp_i2 = 0;
+        y[jy] += temp_r1 * a_ptr[ja];
+        y[jy+1] += temp_i1 * a_ptr[ja];
+        ix = jx + inc_x2;
+        iy = jy + inc_y2;
+        ia = ja + 2;
+        i = j + 1;
+        len = m - i;
+        if(len > 0){
+            gvl = VSETVL(len);
+            inc_xv = incx * gvl * 2;
+            inc_yv = incy * gvl * 2;
+            inc_av = gvl * 2;
+            vr0 = VFMVVF_FLOAT(0, gvl);
+            vr1 = VFMVVF_FLOAT(0, gvl);
+            for(k = 0; k < len / gvl; k++){
+                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
+                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
+                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+#ifndef HEMVREV
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#else
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
+                vy1 = 
VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + + if(i < m){ + unsigned int gvl_rem = VSETVL(m-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); +#else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); +#endif + } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); + } + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c index 0a284a999..117db7d84 100644 --- a/kernel/riscv64/zhemv_LM_vector.c +++ b/kernel/riscv64/zhemv_LM_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -143,9 +151,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); temp_r2 = VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); temp_i2 = VFMVFS_FLOAT(v_res); if(i < m){ gvl = VSETVL(m-i); @@ -181,9 +189,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - v_res = VFREDSUM_FLOAT(v_res, vr0, 
v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); temp_r2 += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); temp_i2 += VFMVFS_FLOAT(v_res); } } diff --git a/kernel/riscv64/zhemv_UV_rvv.c b/kernel/riscv64/zhemv_UV_rvv.c new file mode 100644 index 000000000..ec06622fc --- /dev/null +++ b/kernel/riscv64/zhemv_UV_rvv.c @@ -0,0 +1,199 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu
+#else
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu
+#endif
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
+    BLASLONG i, j, k;
+    BLASLONG ix, iy, ia;
+    BLASLONG jx, jy, ja;
+    FLOAT temp_r1, temp_i1;
+    FLOAT temp_r2, temp_i2;
+    FLOAT *a_ptr = a;
+    unsigned int gvl = 0;
+    FLOAT_V_T_M1 v_res, v_z0;
+    gvl = VSETVL_MAX;
+    v_res = VFMVVF_FLOAT_M1(0, gvl);
+    v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+
+    FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1;
+    BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2;
+
+    BLASLONG inc_x2 = incx * 2;
+    BLASLONG inc_y2 = incy * 2;
+    stride_x = inc_x2 * sizeof(FLOAT);
+    stride_y = inc_y2 * sizeof(FLOAT);
+    stride_a = 2 * sizeof(FLOAT);
+    lda2 = lda * 2;
+
+    BLASLONG m1 = m - offset;
+    a_ptr = a + m1 * lda2;
+    jx = m1 * inc_x2;
+    jy = m1 * inc_y2;
+    ja = m1 * 2;
+    for(j = m1; j < m; j++){
+        temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];
+        temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx];
+        temp_r2 = 0;
+        temp_i2 = 0;
+        ix = 0;
+        iy = 0;
+        ia = 0;
+        i = 0;
+        if(j > 0){
+            gvl = VSETVL(j);
+            inc_xv = incx * gvl * 2;
+            inc_yv = incy * gvl * 2;
+            inc_av = gvl * 2;
+            vr0 = VFMVVF_FLOAT(0, gvl);
+            vr1 = VFMVVF_FLOAT(0, gvl);
+            for(k = 0; k < j / gvl; k++){
+                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
+                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
+                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+#ifndef HEMVREV
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#else
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
+                vy1 = 
VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + + if(i < j){ + unsigned int gvl_rem = VSETVL(j-i); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); +#else + vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem); + vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem); + vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem); +#endif + } + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); + temp_r2 = VFMVFS_FLOAT(v_res); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); + temp_i2 = VFMVFS_FLOAT(v_res); + } + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * a_ptr[ja]; + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c index 33b7c9c25..7c6b63bf3 100644 --- a/kernel/riscv64/zhemv_UV_vector.c +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
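/*
 * Scalar sketch (illustrative only) of the per-element update that the
 * VFMACCVF/VFNMSACVF pairs in zhemv_UV_rvv.c vectorize: y += t * a on the
 * default path, and y += t * conj(a) when HEMVREV is defined.
 */
static inline void zhemv_axpy_sketch(double *yr, double *yi,
                                     double tr, double ti,
                                     double ar, double ai, int conj_a)
{
    if (conj_a)                    /* HEMVREV path multiplies by conj(a) */
        ai = -ai;
    *yr += tr * ar - ti * ai;      /* real part of t*a                   */
    *yi += tr * ai + ti * ar;      /* imaginary part of t*a              */
}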
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t -#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMULVV_FLOAT vfmul_vv_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t #define FLOAT_V_T_M1 vfloat64m1_t -#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMULVV_FLOAT vfmul_vv_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 -#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 +#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#ifdef RISCV_0p10_INTRINSICS +#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) +#else +#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) +#endif +#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1) +#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -142,9 +150,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); temp_r2 = VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); temp_i2 = VFMVFS_FLOAT(v_res); if(i < j){ gvl = VSETVL(j-i); @@ -180,9 +188,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - v_res = VFREDSUM_FLOAT(v_res, vr0, 
v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); temp_r2 += VFMVFS_FLOAT(v_res); - v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); temp_i2 += VFMVFS_FLOAT(v_res); } } diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c new file mode 100644 index 000000000..32f67758a --- /dev/null +++ b/kernel/riscv64/znrm2_rvv.c @@ -0,0 +1,275 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define MASK_T vbool8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VMFIRSTM __riscv_vfirst_m_b8 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f32m4_b8 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define MASK_T vbool16_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VMFIRSTM __riscv_vfirst_m_b16 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f64m4_b16 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + + if (n <= 0 || inc_x <= 0) return(0.0); + + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; + if (inc_x == 1) { + BLASLONG n2 = n * 2; + gvl = VSETVL(n2); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for (i=0,j=0; i 0 ){ // scale change? 
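    /*
     * The rescaling below maintains the same invariant as the classic scalar
     * nrm2 (sketch, for reference only): scale and ssq are kept so that the
     * partial sum of squares equals scale^2 * ssq.  In scalar form, for each
     * new magnitude a = |x_k|:
     *     if (a > scale)      { ssq = 1.0 + ssq * (scale / a) * (scale / a); scale = a; }
     *     else if (a != 0.0)  { ssq += (a / scale) * (a / scale); }
     * and the final norm is scale * sqrt(ssq), which keeps the intermediate
     * sum of squares away from overflow and underflow.
     */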
+ // find largest element in v0 and v1 + v_res = VFREDMAX( v0, v_z0, gvl ); + v_res = VFREDMAX( v1, v_res, gvl ); + FLOAT const largest_elt = EXTRACT_FLOAT( v_res ); + + v_scale = VFDIV( v_scale, largest_elt, gvl ); // scale/largest_elt + v_scale = VFMUL( v_scale, v_scale, gvl ); // (scale/largest_elt)*(scale/largest_elt) + v_ssq = VFMUL( v_scale, v_ssq, gvl ); // ssq*(scale/largest_elt)*(scale/largest_elt) + + v_scale = VFMVVF_FLOAT( largest_elt, gvl ); // splated largest_elt becomes new scale } - //ssq in vector vr: vr[0] - v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); - //total ssq now - ssq += VFMVFS_FLOAT(v_res); - //tail - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); - //fabs(vector) - mask = VMFLTVF_FLOAT(v0, 0, gvl); - v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl); - //if scale change - mask = VMFGTVF_FLOAT(v0, scale, gvl); - index = VMFIRSTM(mask, gvl); - if(index == -1){//no elements greater than scale - if(scale != 0.0){ - v0 = VFDIVVF_FLOAT(v0, scale, gvl); - vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl); + MASK_T nonzero_mask0 = VMFNE( v0, 0, gvl ); + MASK_T nonzero_mask1 = VMFNE( v1, 0, gvl ); + v0 = VFDIV_M( nonzero_mask0, v_zero, v0, v_scale, gvl ); + v1 = VFDIV_M( nonzero_mask1, v_zero, v1, v_scale, gvl ); + v_ssq = VFMACC_M( nonzero_mask0, v_ssq, v0, v0, gvl ); + v_ssq = VFMACC_M( nonzero_mask1, v_ssq, v1, v1, gvl ); + + idx += inc_x * gvl * 2; + } + + v_res = VFREDUSUM(v_ssq, v_z0, gvl); + FLOAT ssq = EXTRACT_FLOAT(v_res); + FLOAT scale = EXTRACT_FLOAT0_V(v_scale); + + //finish any tail using scalar ops + i*=gvl; + if(i 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSEG_FLOAT(x, vtx2, vl); + VSSEG_FLOAT(y, vyx2, vl); + } + + } else if (inc_x == 1){ + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSEG_FLOAT(x, vtx2, vl); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + + } else if (inc_y == 1){ + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vt0 = VFMULVF_FLOAT(vx0, c, 
vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSSEG_FLOAT(x, stride_x, vtx2, vl); + VSSEG_FLOAT(y, vyx2, vl); + } + + } else { + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + vx0 = VGET_VX2(vxx2, 0); + vx1 = VGET_VX2(vxx2, 1); + vy0 = VGET_VX2(vyx2, 0); + vy1 = VGET_VX2(vyx2, 1); + + vt0 = VFMULVF_FLOAT(vx0, c, vl); + vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl); + vt1 = VFMULVF_FLOAT(vx1, c, vl); + vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl); + vy0 = VFMULVF_FLOAT(vy0, c, vl); + vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl); + vy1 = VFMULVF_FLOAT(vy1, c, vl); + vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl); + + vtx2 = VSET_VX2(vtx2, 0, vt0); + vtx2 = VSET_VX2(vtx2, 1, vt1); + vyx2 = VSET_VX2(vyx2, 0, vy0); + vyx2 = VSET_VX2(vyx2, 1, vy1); + + VSSSEG_FLOAT(x, stride_x, vtx2, vl); + VSSSEG_FLOAT(y, stride_y, vyx2, vl); + } + } + + return 0; +} diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c index 727d13a87..50751b343 100644 --- a/kernel/riscv64/zrot_vector.c +++ b/kernel/riscv64/zrot_vector.c @@ -27,27 +27,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -59,7 +59,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT unsigned int gvl = 0; 
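/*
 * Both zrot_rvv.c above and this kernel apply the same Givens rotation with
 * real c and s to interleaved complex data; per element (scalar sketch):
 *     x_new = c * x + s * y
 *     y_new = c * y - s * x
 * evaluated independently on the real (vx0/vy0) and imaginary (vx1/vy1)
 * components, which is why no cross terms between the two parts appear.
 */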
FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1; - gvl = VSETVL(n); + gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * 2 * gvl; @@ -112,7 +112,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } }else{ - if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1); for(i=0,j=0; i < n/gvl; i++){ vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c new file mode 100644 index 000000000..ae79d9f9d --- /dev/null +++ b/kernel/riscv64/zscal_rvv.c @@ -0,0 +1,112 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e32_v_f32m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VSSEG_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSSSEG_FLOAT __riscv_vssseg2e64_v_f64m4x2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + + if((n <= 0) || (inc_x <= 0)) return(0); + + FLOAT_V_T vt, vr, vi; + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + size_t vlmax = VSETVL_MAX; + FLOAT_VX2_T vx2; + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + vx2 = VLSEG_FLOAT(x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); + + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); + + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + VSSEG_FLOAT(x, vx2, vl); + } + + } else { + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + vr = VGET_VX2(vx2, 0); + vi = VGET_VX2(vx2, 1); + + vt = VFMULVF_FLOAT(vr, da_r, vl); + vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); + vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); + + vx2 = VSET_VX2(vx2, 0, vt); + vx2 = VSET_VX2(vx2, 1, vi); + VSSSEG_FLOAT(x, stride_x, vx2, vl); + } + } + + return(0); +} diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 77f4fc312..536bbdf73 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -27,25 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
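/*
 * Scalar sketch (illustrative only) of the per-element update zscal_rvv.c
 * performs with the segment loads above: x_k := (da_r + i*da_i) * x_k.
 */
static inline void zscal_sketch(double *xr, double *xi, double da_r, double da_i)
{
    double re = da_r * (*xr) - da_i * (*xi);  /* matches VFMULVF + VFNMSACVF */
    double im = da_r * (*xi) + da_i * (*xr);  /* matches VFMULVF + VFMACCVF  */
    *xr = re;
    *xi = im;
}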
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)() #define FLOAT_V_T vfloat32m4_t -#define VLSEV_FLOAT vlse32_v_f32m4 -#define VSSEV_FLOAT vsse32_v_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMULVF_FLOAT vfmul_vf_f32m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) #else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m1() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)() #define FLOAT_V_T vfloat64m4_t -#define VLSEV_FLOAT vlse64_v_f64m4 -#define VSSEV_FLOAT vsse64_v_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMULVF_FLOAT vfmul_vf_f64m4 -#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -59,86 +59,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F unsigned int gvl = 0; FLOAT_V_T vt, v0, v1; - if(da_r == 0.0 && da_i == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - vt = VFMVVF_FLOAT(0.0, gvl); - for(i=0,j=0; i < n/(gvl*2); i++){ - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl); - - j += gvl*2; - ix += inc_xv*2; - } - for(; j < n; ){ - gvl = VSETVL(n-j); - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl); - j += gvl; - ix += inc_x * 2 * gvl; - } -#if 0 - }else if(da_r == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - for(i=0,j=0; i < n/gvl; i++){ - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v1, -da_i, gvl); - v1 = VFMULVF_FLOAT(v0, da_i, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - - j += gvl; - ix += inc_xv; - } -#endif - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v1, -da_i, gvl); - v1 = VFMULVF_FLOAT(v0, da_i, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - } - }else if(da_i == 0.0){ - gvl = VSETVL(n); - BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); - BLASLONG inc_xv = inc_x * 2 * gvl; - for(i=0,j=0; i < n/gvl; i++){ - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v0, da_r, gvl); - v1 = VFMULVF_FLOAT(v1, da_r, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], 
stride_x, v1, gvl); - - j += gvl; - ix += inc_xv; - } - if(j < n){ - gvl = VSETVL(n-j); - v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); - v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); - - vt = VFMULVF_FLOAT(v0, da_r, gvl); - v1 = VFMULVF_FLOAT(v1, da_r, gvl); - - VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); - VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); - } - }else{ + { gvl = VSETVL(n); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); BLASLONG inc_xv = inc_x * 2 * gvl; diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c new file mode 100644 index 000000000..489188bd5 --- /dev/null +++ b/kernel/riscv64/zsum_rvv.c @@ -0,0 +1,107 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f32m4_tu +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFADDVV_FLOAT_TU __riscv_vfadd_vv_f64m4_tu +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + FLOAT_V_T v0, v1; + FLOAT_VX2_T vx2; + size_t vlmax = VSETVL_MAX; + FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax); + + if(inc_x == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2) { + vl = VSETVL(n); + + vx2 = VLSEG_FLOAT(x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); + } + + } else { + + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { + vl = VSETVL(n); + + vx2 = VLSSEG_FLOAT(x, stride_x, vl); + + v0 = VGET_VX2(vx2, 0); + v1 = VGET_VX2(vx2, 1); + + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl); + v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl); + } + + } + + FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax); + v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax); + sumf += VFMVFS_FLOAT_M1(v_res); + + return(sumf); +} diff --git a/kernel/riscv64/zsum_vector.c b/kernel/riscv64/zsum_vector.c new file mode 100644 index 000000000..ca0b02b5c --- /dev/null +++ b/kernel/riscv64/zsum_vector.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
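/*
 * Sketch (illustrative only) of the tail-undisturbed accumulation pattern used
 * by zsum_rvv.c above: v_sum holds vlmax partial sums, the _tu adds leave lanes
 * past the current vl untouched, and a single vfredusum at the end collapses
 * them.  Scalar equivalent, assuming every chunk except possibly the last is
 * vlmax elements long:
 */
static double lanewise_sum_sketch(const double *x, long n, int vlmax)
{
    double lanes[64] = {0.0};              /* stands in for v_sum; assumes vlmax <= 64 */
    for (long i = 0; i < n; i++)
        lanes[i % vlmax] += x[i];          /* lane k of each chunk accumulates in place */
    double s = 0.0;
    for (int k = 0; k < vlmax; k++)        /* final reduction (VFREDSUMVS_FLOAT)        */
        s += lanes[k];
    return s;
}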
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b32 +# else +# define ELEN 32 +# define MLEN _b16 +# endif +#else +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN _b8 +# else +# define ELEN 32 +# define MLEN _b4 +# endif +#endif + +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define FLOAT_V_T_M1 JOIN(vfloat, ELEN, m1, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1)) +#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _) +#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _) +#define VFADDVV_FLOAT JOIN(RISCV_RVV(vfadd), _vv_f, ELEN, LMUL, _) +#define VMFLTVF_FLOAT JOIN(RISCV_RVV(vmflt), _vf_f, ELEN, LMUL, MLEN) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res; + v_res = VFMVVF_FLOAT_M1(0, 1); + + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = VSETVL(n2); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl <= n2/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i 0; m -= vl * 2, ptr -= vl*inc_y * 2) { + vl = VSETVL(m); + vyx2 = VLSSEG_FLOAT(ptr - 2, stride_y, vl); + VSSSEG_FLOAT(ptr, stride_y, vyx2, vl); + } + y[0] = temp[0]; + y[1] = temp[1]; + } + else if(inc_y == 0) { + FLOAT temp[2]; + temp[0] = y[0]; + temp[1] = y[1]; + y[0] = x[(n - 1) * inc_x * 2]; + y[0] = x[(n - 1) * inc_x * 2 + 1]; + FLOAT* ptr = x + (n - 1) * inc_x * 2; // start from the last one + BLASLONG stride_x = (0 - inc_x) * sizeof(FLOAT) * 2; // reverse + BLASLONG m = n - 1; + for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_x * 2) { + vl = VSETVL(m); + vxx2 = VLSSEG_FLOAT(ptr - 2, stride_x, vl); + VSSSEG_FLOAT(ptr, stride_x, vxx2, vl); + } + x[0] = temp[0]; + x[1] = temp[1]; + } + else if(inc_x == 1 && inc_y == 1) { + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSEG_FLOAT(x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + VSSEG_FLOAT(y, vxx2, vl); + VSSEG_FLOAT(x, vyx2, vl); + } + + } else if (inc_x == 1){ + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = 
VLSEG_FLOAT(x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + VSSEG_FLOAT(x, vyx2, vl); + } + + } else if (inc_y == 1){ + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSEG_FLOAT(y, vl); + + VSSEG_FLOAT(y, vxx2, vl); + VSSSEG_FLOAT(x, stride_x, vyx2, vl); + } + + } else { + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT); + + for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) { + vl = VSETVL(n); + + vxx2 = VLSSEG_FLOAT(x, stride_x, vl); + vyx2 = VLSSEG_FLOAT(y, stride_y, vl); + + VSSSEG_FLOAT(y, stride_y, vxx2, vl); + VSSSEG_FLOAT(x, stride_x, vyx2, vl); + } + + } + + return(0); +} diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c index 09cc8992a..f2734c4a9 100644 --- a/kernel/riscv64/zswap_vector.c +++ b/kernel/riscv64/zswap_vector.c @@ -27,35 +27,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m8(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m8_t -#define VLEV_FLOAT vle32_v_f32m8 -#define VLSEV_FLOAT vlse32_v_f32m8 -#define VSEV_FLOAT vse32_v_f32m8 -#define VSSEV_FLOAT vsse32_v_f32m8 + +#ifdef RISCV64_ZVL256B +# define LMUL m2 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 64 +# else +# define ELEN 32 +# define MLEN 32 +# endif #else -#define VSETVL(n) vsetvl_e64m8(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m8_t -#define VLEV_FLOAT vle64_v_f64m8 -#define VLSEV_FLOAT vlse64_v_f64m8 -#define VSEV_FLOAT vse64_v_f64m8 -#define VSSEV_FLOAT vsse64_v_f64m8 +# define LMUL m8 +# if defined(DOUBLE) +# define ELEN 64 +# define MLEN 16 +# else +# define ELEN 32 +# define MLEN 8 +# endif #endif +#define _ +#define JOIN2_X(x, y) x ## y +#define JOIN2(x, y) JOIN2_X(x, y) +#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) + +#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) +#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) +#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) +#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) +#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) +#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; BLASLONG ix = 0,iy = 0; BLASLONG stride_x, stride_y; FLOAT_V_T vx0, vx1, vy0, vy1; - unsigned int gvl = 0; + unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? 
n : 1); + if( inc_x == 0 && inc_y == 0 ) { n = n & 1; } - if (n < 0) return(0); + if (n <= 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = VSETVL(n); BLASLONG n2 = n * 2; if(gvl <= n2/2){ for(i=0,j=0; i 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posX * 2 + posY * lda * 2; + ao2 = a + posY * 2 + (posX) * lda * 2; + + for (i = m; i > 0; i--, offset--) { + + va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl); + va1x2 = VLSEG2_FLOAT(ao1, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); + + b += vl * 2; + ao1 += lda * 2; + ao2 += 2; + } + } + + return 0; +} + diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c new file mode 100644 index 000000000..069551bb0 --- /dev/null +++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
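/*
 * Scalar sketch (illustrative only) of the selection the vid/vadd/vmsgt/vmerge
 * sequence in these symm copy kernels vectorizes: for lane k the packed element
 * is taken from one of the two source pointers depending on the sign of
 * (k + offset), i.e. on which side of the diagonal the element lies.  Which
 * operand corresponds to the row walk and which to the column walk is swapped
 * between the lcopy and ucopy variants.
 */
static inline void symm_pick_sketch(double b[2],
                                    const double *first,   /* merge "false" operand */
                                    const double *second,  /* merge "true" operand  */
                                    long k, long offset)
{
    const double *src = (k + offset > 0) ? second : first; /* VMSGT_VX + VMERGE_VVM */
    b[0] = src[0];                                          /* real part             */
    b[1] = src[1];                                          /* imaginary part        */
}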
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define INT_V_T vint32m2_t +#define VID_V_INT __riscv_vid_v_i32m2 +#define VADD_VX_INT __riscv_vadd_vx_i32m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 +#define VBOOL_T vbool16_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define INT_V_T vint64m2_t +#define VID_V_INT __riscv_vid_v_i64m2 +#define VADD_VX_INT __riscv_vadd_vx_i64m2 +#define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 +#define VBOOL_T vbool32_t +#define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) +{ + BLASLONG i, js, offset; + + FLOAT *ao1, *ao2; + + BLASLONG stride_lda = sizeof(FLOAT)*lda * 2; + + FLOAT_V_T vb0, vb1, va10, va11, va20, va21; + FLOAT_VX2_T va1x2, va2x2, vbx2; + VBOOL_T vbool; + INT_V_T vindex_max, vindex; + + + size_t vl = VSETVL_MAX; + vindex_max = VID_V_INT(vl); + + for (js = n; js > 0; js -= vl, posX += vl) { + vl = VSETVL(js); + offset = posX - posY; + + ao1 = a + posY * 2 + (posX + 0) * lda * 2; + ao2 = a + posX * 2 + 0 + posY * lda * 2; + + for (i = m; i > 0; i--, offset--) { + va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl); + va2x2 = VLSEG2_FLOAT(ao2, vl); + + va20 = VGET_VX2(va2x2, 0); + va21 = VGET_VX2(va2x2, 1); + va10 = VGET_VX2(va1x2, 0); + va11 = VGET_VX2(va1x2, 1); + + vindex = VADD_VX_INT(vindex_max, offset, vl); + vbool = VMSGT_VX_INT(vindex, 0, vl); + + vb0 = VMERGE_VVM_FLOAT(va20, va10, vbool, vl); + vb1 = VMERGE_VVM_FLOAT(va21, va11, vbool, vl); + + vbx2 = VSET_VX2(vbx2, 0, vb0); + vbx2 = VSET_VX2(vbx2, 1, vb1); + VSSEG2_FLOAT(b, vbx2, vl); + + b += vl * 2; + ao1 += 2; + ao2 += lda * 2; + } + } + + return 0; +} diff --git a/kernel/riscv64/zsymv_L_rvv.c b/kernel/riscv64/zsymv_L_rvv.c new file mode 100644 index 000000000..cefdea7f6 --- /dev/null +++ b/kernel/riscv64/zsymv_L_rvv.c @@ -0,0 +1,179 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#define VFNEGV_FLOAT __riscv_vfneg_v_f32mf4 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#define VFNEGV_FLOAT __riscv_vfneg_v_f64mf4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1[2]; + FLOAT temp2[2]; + FLOAT *a_ptr = a; + BLASLONG gvl = VSETVL_MAX; + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = 
VFMVVF_FLOAT_M1(0, gvl); + + FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i; + BLASLONG stride_x, stride_y, inc_xv, inc_yv, len; + + stride_x = 2 * inc_x * sizeof(FLOAT); + stride_y = 2 * inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0){ + gvl = VSETVL(len); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr_r = VFMVVF_FLOAT(0, gvl); + vr_i = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + + if(i < m){ + unsigned int gvl_rem = VSETVL(m-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem); + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem); + vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem); + vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem); + + } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); + } + y[2 * jy] += alpha_r * temp2[0] - alpha_i * temp2[1]; + y[2 * jy + 1] += alpha_r * temp2[1] + alpha_i * temp2[0]; + + jx += inc_x; + jy += inc_y; + a_ptr += 2 * lda; + } + + return(0); +} + diff --git a/kernel/riscv64/zsymv_U_rvv.c b/kernel/riscv64/zsymv_U_rvv.c new file mode 100644 index 000000000..67b5a649c --- /dev/null +++ b/kernel/riscv64/zsymv_U_rvv.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m4 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VSEV_FLOAT __riscv_vse32_v_f32m4 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, k; + BLASLONG ix,iy; + BLASLONG jx,jy; + FLOAT temp1[2]; + FLOAT temp2[2]; + FLOAT *a_ptr = a; + BLASLONG gvl = VSETVL_MAX; + FLOAT_V_T_M1 v_res, v_z0; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + + FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i; + BLASLONG stride_x, stride_y, inc_xv, inc_yv; + + BLASLONG m1 = m - offset; + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = 2 * inc_x * sizeof(FLOAT); + stride_y = 2 * inc_y * sizeof(FLOAT); + for 
(j=m1; j 0){ + ix = 0; + iy = 0; + i = 0; + gvl = VSETVL(j); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr_r = VFMVVF_FLOAT(0, gvl); + vr_i = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl); + + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl); + vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl); + vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl); + vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + + if(i < j){ + unsigned int gvl_rem = VSETVL(j-i); + vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem); + vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem); + + va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem); + va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem); + + vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem); + vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem); + vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem); + + VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem); + VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem); + + vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem); + vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem); + vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem); + vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, va_i, gvl_rem); + vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem); + + } + v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); + temp2[0] = VFMVFS_FLOAT_M1(v_res); + v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); + temp2[1] = VFMVFS_FLOAT_M1(v_res); + } + + y[2 * jy] += temp1[0] * a_ptr[j * 2] - temp1[1] * a_ptr[j * 2 + 1] + alpha_r * temp2[0] - alpha_i * temp2[1]; + y[2 * jy + 1] += temp1[1] * a_ptr[j * 2] + temp1[0] * a_ptr[j * 2 + 1] + alpha_r * temp2[1] + alpha_i * temp2[0]; + + a_ptr += 2 * lda; + jx += inc_x; + jy += inc_y; + } + + return(0); +} + diff --git a/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c new file mode 100644 index 000000000..d7d5e5fea --- /dev/null +++ b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c @@ -0,0 +1,805 @@ +/* + +AUTOGENERATED KERNEL +Script: ./kernel/riscv64/generate_kernel.py +Settings: + LMUL=2 + M=4 + M_tail_scalar_from=2 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl128b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=128 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=2 + VFMACC='__riscv_vfmacc_vf_f64m2' + VFMUL='__riscv_vfmul_vf_f64m2' + VLEV='__riscv_vle64_v_f64m2' + VLSEV='__riscv_vlse64_v_f64m2' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m2' + VSETVL='__riscv_vsetvl_e64m2' + VSEV='__riscv_vse64_v_f64m2' + VSSEV='__riscv_vsse64_v_f64m2' + acc_vector_t='vfloat64m2_t' + output='ztrmm_kernel_4x4_zvl128b.c' + 
param_scalar_t='double' + param_vector_t='vfloat64m2_t' + +*/ + +#include "common.h" + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define S0 1 +#define S1 -1 +#define S2 1 +#define S3 1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define S0 1 +#define S1 1 +#define S2 1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define S0 1 +#define S1 1 +#define S2 -1 +#define S3 1 +#define VFMACC_RR __riscv_vfmacc +#define VFMACC_RI __riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define S0 1 +#define S1 -1 +#define S2 -1 +#define S3 -1 +#define VFMACC_RR __riscv_vfmsac +#define VFMACC_RI __riscv_vfnmacc +#endif + +#if defined(LEFT) != defined(TRANSA) +#define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + // -- MAIN PASS + + for (BLASLONG j = 0; j < N / 4; j += 1) { + m_top = 0; + BLASLONG gvl = __riscv_vsetvl_e64m2(4); + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 4; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + double B2r = B[bi + 2 * 2 + 0]; + double B2i = B[bi + 2 * 2 + 1]; + double B3r = B[bi + 3 * 2 + 0]; + double B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k + // leaving 6 vector registers for temporaries + // performing 2 operations between reuses of temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + vfloat64m2_t ACC2r = tmp0r; + vfloat64m2_t ACC2i = tmp0i; + vfloat64m2_t ACC3r = tmp1r; + vfloat64m2_t ACC3i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + B2r = B[bi + 2 * 2 + 0]; + B2i = 
B[bi + 2 * 2 + 1]; + B3r = B[bi + 3 * 2 + 0]; + B3i = B[bi + 3 * 2 + 1]; + bi += 4 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl); + tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl); + ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl); + ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl); + ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl); + ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + vfloat64m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl); + vfloat64m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl); + vfloat64m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl); + vfloat64m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl); + C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl); + C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl); + C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl); + + m_top += 4; + } + + // -- tails for main pass + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + double result8 = 0; + double result9 = 0; + double result10 = 0; + double result11 = 0; + double result12 = 0; + double result13 = 0; + double result14 = 0; + double result15 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 4 * 2; + pass_K 
-= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1]; + result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1]; + result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1]; + result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1]; + ai += 2 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + Cr = result8 * alphar; + Ci = result9 * alphar; + Cr -= result9 * alphai; + Ci += result8 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result10 * alphar; + Ci = result11 * alphar; + Cr -= result11 * alphai; + Ci += result10 * alphai; + C[(ci + 2 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 1) * 2 + 1] = Ci; + Cr = result12 * alphar; + Ci = result13 * alphar; + Cr -= result13 * alphai; + Ci += result12 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + Cr = result14 * alphar; + Ci = result15 * alphar; + Cr -= result15 * alphai; + Ci += result14 * alphai; + C[(ci + 3 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 4 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; 
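+/* TRMM note: the off/pass_K logic in this tail limits the K loop to the
+   triangular part of the operand. With BACKWARDS defined (LEFT != TRANSA)
+   the first `off` rank-1 updates are skipped by advancing ai/bi and
+   shrinking pass_K; otherwise pass_K is capped at `off` plus the number of
+   rows (LEFT) or columns (not LEFT) handled by this block - a single row in
+   this M&1 tail, hence off + 1, matching the off + 2 / off + 4 caps used by
+   the wider blocks in this file. */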
+#else + pass_K = off + 4; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1]; + result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1]; + result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1]; + ai += 1 * 2; + bi += 4 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 2 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 2 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 3 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 3 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 4; + } + + // -- tails for N=2 + + if (N & 2) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 2; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + double B1r = B[bi + 1 * 2 + 0]; + double B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k + // leaving 10 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + vfloat64m2_t ACC1r = tmp1r; + vfloat64m2_t ACC1i = tmp1i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + B1r = B[bi + 1 * 2 + 0]; + B1i = B[bi + 1 * 2 + 1]; + bi += 2 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, 
B0i, gvl); + tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl); + tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl); + tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl); + ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl); + vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl); + C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + ci += ldc - gvl * 0; + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + double result4 = 0; + double result5 = 0; + double result6 = 0; + double result7 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 2 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1]; + result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1]; + ai += 2 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + Cr = result4 * alphar; + Ci = result5 * alphar; + Cr -= result5 * alphai; + Ci += result4 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + Cr = result6 * alphar; + Ci = result7 * alphar; + Cr -= result7 * alphai; + Ci += result6 * alphai; + C[(ci + 1 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off 
= offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 2 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 2; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1]; + result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1]; + ai += 1 * 2; + bi += 2 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 1 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 1 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 2; + } + + // -- tails for N=1 + + if (N & 1) { + gvl = __riscv_vsetvl_e64m2(4); + m_top = 0; + + for (BLASLONG i = 0; i < M / 4; i += 1) { + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 4 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 4; +#else + pass_K = off + 1; +#endif +#endif + double B0r = B[bi + 0 * 2 + 0]; + double B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k + // leaving 12 vector registers for temporaries + vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + vfloat64m2_t ACC0r = tmp0r; + vfloat64m2_t ACC0i = tmp0i; + + for (BLASLONG k = 1; k < pass_K; k++) { + B0r = B[bi + 0 * 2 + 0]; + B0i = B[bi + 0 * 2 + 1]; + bi += 1 * 2; + + A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); + A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); + ai += 4 * 2; + + tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); + tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); + tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl); + tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl); + ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); + ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl); + } + + BLASLONG ci = n_top * ldc + m_top; + + vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); + vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl); + C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); + C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); + __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl); + + m_top += 4; + } + + if (M & 2) { + double result0 = 0; + double result1 = 0; + double result2 = 0; + double result3 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai 
+= off * 2 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 2; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1]; + result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1]; + ai += 2 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + Cr = result2 * alphar; + Ci = result3 * alphar; + Cr -= result3 * alphai; + Ci += result2 * alphai; + C[(ci + 0 * ldc + 1) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 1) * 2 + 1] = Ci; + m_top += 2; + } + + if (M & 1) { + double result0 = 0; + double result1 = 0; + BLASLONG ai = m_top * K * 2; + BLASLONG bi = n_top * K * 2; + BLASLONG pass_K = K; +#ifdef LEFT + BLASLONG off = offset + m_top; +#else + BLASLONG off = -offset + n_top; +#endif +#ifdef BACKWARDS + ai += off * 1 * 2; + bi += off * 1 * 2; + pass_K -= off; +#else +#ifdef LEFT + pass_K = off + 1; +#else + pass_K = off + 1; +#endif +#endif + + for (BLASLONG k = 0; k < pass_K; k++) { + result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; + result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; + ai += 1 * 2; + bi += 1 * 2; + } + + BLASLONG ci = n_top * ldc + m_top; + double Cr, Ci; + Cr = result0 * alphar; + Ci = result1 * alphar; + Cr -= result1 * alphai; + Ci += result0 * alphai; + C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; + C[(ci + 0 * ldc + 0) * 2 + 1] = Ci; + m_top += 1; + } + + n_top += 1; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c b/kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c new file mode 100644 index 000000000..de7622b89 --- /dev/null +++ b/kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c @@ -0,0 +1,1337 @@ +/* + +AUTOGENERATED KERNEL +Settings: + LMUL=1 + M=8 + M_tail_scalar_from=1 + N=4 + __riscv_='__riscv_' + complex=True + conjugate=False + cpu='zvl256b' + force_acc_double=False + index_type='BLASLONG' + op='trmm' + param_precision='double' + reg_width_bits=256 + tail_policy='' + trace=False + +Derived: + ELEN_ACC=64 + ELEN_PARAM=64 + LMUL_ACC=1 + VFMACC='__riscv_vfmacc_vf_f64m1' + VFMUL='__riscv_vfmul_vf_f64m1' + VLEV='__riscv_vle64_v_f64m1' + VLSEV='__riscv_vlse64_v_f64m1' + VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1' + VMUL_TO_ACC='__riscv_vfmul_vf_f64m1' + VSETVL='__riscv_vsetvl_e64m1' + VSEV='__riscv_vse64_v_f64m1' + VSSEV='__riscv_vsse64_v_f64m1' + acc_vector_t='vfloat64m1_t' + output='ztrmm_kernel_8x4_zvl256b.c' + param_scalar_t='double' + param_vector_t='vfloat64m1_t' + +*/ + +#include "common.h" + + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define S0 1 + #define S1 -1 + #define S2 1 + #define S3 1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfmacc +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define S0 1 + #define S1 1 + #define S2 1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI __riscv_vfmsac +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define S0 1 + #define S1 1 + #define S2 -1 + #define S3 1 + #define VFMACC_RR __riscv_vfmacc + #define VFMACC_RI 
__riscv_vfnmsac +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define S0 1 + #define S1 -1 + #define S2 -1 + #define S3 -1 + #define VFMACC_RR __riscv_vfmsac + #define VFMACC_RI __riscv_vfnmacc +#endif + + +#if defined(LEFT) != defined(TRANSA) + #define BACKWARDS +#endif + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset) + +{ + BLASLONG gvl = 0; + BLASLONG m_top = 0; + BLASLONG n_top = 0; + + + // -- MAIN PASS + + for (BLASLONG j=0; j +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VBOOL_T vbool16_t +#define UINT_V_T vint32m2_t +#define VID_V_UINT __riscv_vid_v_i32m2 +#define VMSGTU_VX_UINT __riscv_vmsgt_vx_i32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_i32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1; + + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY * 2 + posX * lda * 2; + } + else + { + ao = a + posX * 2 + posY * lda * 2; + } + + i = 0; + do + { + if (X > posY) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); + + ao += 2; + b += vl * 2; + + X ++; + i ++; + } + else if (X < posY) + { + ao += lda * 2; + b += vl * 2; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); +#endif + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); + ao += 2; + b += vl * 2; + } + + X += vl; + i += vl; + } + } while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c new file 
mode 100644 index 000000000..ab8d34337 --- /dev/null +++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + + FLOAT *ao; + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1; + size_t vl; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posY * 2 + posX * lda * 2; + 
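+            /* In the do/while sweep below, off-diagonal blocks are either
+               copied verbatim with the segment load/store pair (VLSEG2/VSSEG2
+               keep real and imaginary parts interleaved) or skipped by only
+               advancing the pointers. In the diagonal block (X == posY),
+               vid_v generates the lane indices 0..vl-1 so that vfmerge can
+               zero the lanes whose index is less than j and, when UNIT is
+               defined, overwrite the lane with index == j with 1 + 0i. */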
} + else + { + ao = a + posX * 2 + posY * lda * 2; + } + + i = 0; + do + { + if (X > posY) + { + ao += 2; + b += vl * 2; + X++; + i++; + } + else if (X < posY) + { + //va1 = VLEV_FLOAT(ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); + + ao += lda * 2; + b += vl * 2; + X ++; + i ++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + //va1 = VLEV_FLOAT(ao, vl); + vax2 = VLSEG2_FLOAT(ao, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); +#endif + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); + ao += lda * 2; + b += vl * 2; + } + X += vl; + i += vl; + + } + } while (i < m); + + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c new file mode 100644 index 000000000..ba6e63b96 --- /dev/null +++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEV_FLOAT __riscv_vlse32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, X; + BLASLONG stride_lda = sizeof(FLOAT) * lda * 2; + FLOAT *ao; + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1; + size_t vl; + +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + X = posX; + + if (posX <= posY) + { + ao = a + posX * 2 + posY * lda * 2; + } + else + { + ao = a + posY * 2 + posX * lda * 2; + } + + i = 0; + do + { + if (X < posY) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); + + ao += 2; + b += vl * 2; + + X++; + i++; + } + else if (X > posY) + { + ao += lda * 2; + b += vl * 2; + + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); +#endif + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); + ao += 2; + b += vl * 2; + } + + X += vl; + i += vl; + } + }while (i < m); + + posY += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c new file mode 100644 index 000000000..a624fff54 --- /dev/null +++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c @@ -0,0 +1,151 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u32m2_b16 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 +#define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, j, js, X; + + FLOAT *ao; + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1; +#ifdef UNIT + VBOOL_T vbool_eq; +#endif + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + + X = posX; + + if (posX <= posY) + { + ao = a + posX * 2 + posY * lda * 2; + } + else + { + ao = a + posY * 2 + posX * lda * 2; + } + + i = 0; + do + { + if (X < posY) + { + ao += 2; + b += vl * 2; + X++; + i++; + } + else if (X > posY) + { + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); + ao += lda * 2; + b += 
vl * 2; + X++; + i++; + } + else + { + vindex = VID_V_UINT(vl); + for (j = 0; j < vl; j++) + { + vax2 = VLSEG2_FLOAT(ao, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); +#ifdef UNIT + vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); + va0 = VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl); + va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl); +#endif + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(b, vax2, vl); + ao += lda * 2; + b += vl * 2; + } + X += vl; + i += vl; + } + }while (i < m); + posY += vl; + } + + return 0; +} + diff --git a/kernel/riscv64/ztrmmkernel_2x2_rvv.c b/kernel/riscv64/ztrmmkernel_2x2_rvv.c new file mode 100644 index 000000000..399124d2e --- /dev/null +++ b/kernel/riscv64/ztrmmkernel_2x2_rvv.c @@ -0,0 +1,596 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m2() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLSEG4_FLOAT __riscv_vlseg4e32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m2_f32m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m2() +#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VLSEG4_FLOAT __riscv_vlseg4e64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2 +#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m2 +#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m2_f64m1 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 +#endif + +// Optimizes the implementation in ../generic/ztrmmkernel_2x2.c + + +/******************************** + ADD1 a*c + ADD2 b*c + ADD3 a*d + ADD4 b*d + *********************************/ +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb, + FLOAT* C,BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + FLOAT res0,res1; + BLASLONG off, temp; + + FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + FLOAT_V_T_M1 v_m1_res0, v_m1_res1; + FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); + + size_t vl; + size_t vlmax = VSETVL_MAX; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j = bn/2; j > 0; j--) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + C0 = C; + C1 = C0+2*ldc; + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2*2; + ptrbb = bb+off*2*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + vres4 = VFMVVF_FLOAT(0.0, vlmax); + vres5 = VFMVVF_FLOAT(0.0, vlmax); + vres6 = VFMVVF_FLOAT(0.0, vlmax); + vres7 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off + 2; +#else + temp = off + 2; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = 
VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFNMSACVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFNMSACVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl); +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + + vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl); + vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl); + vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl); + + vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl); + vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl); + vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl); + +#endif + ptrba += vl * 4; + ptrbb += vl * 4; + } + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + 
C0[2] = res0 * alphar - res1 * alphai; + C0[3] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres4, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres5, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[0] = res0 * alphar - res1 * alphai; + C1[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres6, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres7, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[2] = res0 * alphar - res1 * alphai; + C1[3] = res1 * alphar + res0 * alphai; +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 2; +#else + temp -= 2; +#endif + + ptrba += temp*2*2; + ptrbb += temp*2*2; + +#endif + +#ifdef LEFT + off += 2; +#endif + + C0 = C0+4; + C1 = C1+4; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off+1; +#else + temp = off+2; +#endif + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); + +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl); + +#endif + ptrba += vl * 2; + ptrbb += vl * 4; + } + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = 
VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C1[0] = res0 * alphar - res1 * alphai; + C1[1] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; +#else + temp -= 2; +#endif + ptrba += temp*2; + ptrbb += temp*2*2; +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+2; + C1 = C1+2; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + k = (bk<<2); + bb = bb+k; + i = (ldc<<2); + C = C+i; + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + + for (i = bm/2; i > 0; i--) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2*2; + ptrbb = bb+off*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + vres2 = VFMVVF_FLOAT(0.0, vlmax); + vres3 = VFMVVF_FLOAT(0.0, vlmax); +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk - off; +#elif defined(LEFT) + temp = off + 2; +#else + temp = off + 1; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + + vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl); + vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl); + vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl); + +#endif + ptrba += vl * 4; + ptrbb += vl * 2; + } + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, 
vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + C0[2] = res0 * alphar - res1 * alphai; + C0[3] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk-off; +#ifdef LEFT + temp -= 2; +#else + temp -= 1; +#endif + ptrba += temp*2*2; + ptrbb += temp*2; +#endif +#ifdef LEFT + off += 2; +#endif + C0 = C0+4; + } + + if (bm & 1) + { +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*2; + ptrbb = bb + off*2; +#endif + vres0 = VFMVVF_FLOAT(0.0, vlmax); + vres1 = VFMVVF_FLOAT(0.0, vlmax); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off + 1; +#else + temp = off + 1; +#endif + + for (k = temp; k > 0; k -= vl) + { + vl = VSETVL(k); + VLSEG2_FLOAT(&va0, &va1, ptrba, vl); + VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); + +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl); + vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl); + vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl); + +#endif + ptrba += vl * 2; + ptrbb += vl * 2; + + } + + v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax); + v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax); + res0 = VFMVFS_FLOAT_M1(v_m1_res0); + res1 = VFMVFS_FLOAT_M1(v_m1_res1); + + C0[0] = res0 * alphar - res1 * alphai; + C0[1] = res1 * alphar + res0 * alphai; + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= 1; +#else + temp -= 1; +#endif + ptrba += temp*2; + ptrbb += temp*2; + +#endif +#ifdef LEFT + off += 1; +#endif + C0 = C0+2; + } + k = (bk<<1); + bb = bb+k; + i = (ldc<<1); + C = C+i; + } + return 0; +} diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c new file mode 100644 index 000000000..db5f06af8 --- /dev/null +++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c @@ -0,0 +1,632 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_VX2_T vfloat32m2x2_t +#define VGET_VX2 __riscv_vget_v_f32m2x2_f32m2 +#define VSET_VX2 __riscv_vset_v_f32m2_f32m2x2 +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m2 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_V_T vfloat64m2_t +#define FLOAT_VX2_T vfloat64m2x2_t +#define VGET_VX2 __riscv_vget_v_f64m2x2_f64m2 +#define VSET_VX2 __riscv_vset_v_f64m2_f64m2x2 +#define VLEV_FLOAT __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m2 +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2 +#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m2 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFMACCVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFMACCVF_FLOAT +#define OP_ri VFMACCVF_FLOAT +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr VFMACCVF_FLOAT +#define OP_ir VFNMSACVF_FLOAT +#define OP_ii VFNMSACVF_FLOAT +#define OP_ri VFNMSACVF_FLOAT +#endif + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i,j,k; + FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb; + BLASLONG 
off, temp; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + FLOAT_VX2_T vax2; + FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7; + FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; + + //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld, offset=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc, offset); // Debug + + size_t vl; + for (j = bn/4; j > 0; j--) + { + C0 = C; + C1 = C0 + 2 * ldc; + C2 = C1 + 2 * ldc; + C3 = C2 + 2 * ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*4*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + vres4 = VFMVVF_FLOAT(0.0, vl); + vres5 = VFMVVF_FLOAT(0.0, vl); + vres6 = VFMVVF_FLOAT(0.0, vl); + vres7 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+4; // number of values in B +#endif + + for (k = temp/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va2, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va3, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va3, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va2, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va2, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va3, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va3, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va2, vl); + + ptrbb += 8; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = 
OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va4, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va5, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va5, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va4, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va4, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va5, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va5, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va4, vl); + + ptrbb += 8; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va6, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va7, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va7, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va6, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va6, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va7, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va7, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va6, vl); + + ptrbb += 8; + } + + for (k = temp & 3; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + vres4 = OP_rr(vres4, *(ptrbb + 4), va0, vl); + vres5 = OP_ir(vres5, *(ptrbb + 4), va1, vl); + vres4 = OP_ii(vres4, *(ptrbb + 5), va1, vl); + vres5 = OP_ri(vres5, *(ptrbb + 5), va0, vl); + + vres6 = OP_rr(vres6, *(ptrbb + 6), va0, vl); + vres7 = OP_ir(vres7, *(ptrbb + 6), va1, vl); + vres6 = OP_ii(vres6, *(ptrbb + 7), va1, vl); + vres7 = OP_ri(vres7, *(ptrbb + 7), va0, vl); + + ptrbb += 8; + } + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + + va2 = VFMULVF_FLOAT(vres2, alphar, vl); + va3 = VFMULVF_FLOAT(vres3, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + + va0 = VFMULVF_FLOAT(vres4, alphar, vl); + va1 = VFMULVF_FLOAT(vres5, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres4, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C2, vax2, vl); + + va2 = VFMULVF_FLOAT(vres6, alphar, vl); + va3 = VFMULVF_FLOAT(vres7, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres6, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C3, vax2, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 4; // number of values in B 
+#endif + ptrba += temp*vl*2; + ptrbb += temp*4*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + + C0 += vl * 2; + C1 += vl * 2; + C2 += vl * 2; + C3 += vl * 2; + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; +#endif + + bb += (bk << 3); + C += (ldc << 3); + } + + if (bn & 2) + { + C0 = C; + C1 = C0 + 2 * ldc; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*2*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + vres2 = VFMVVF_FLOAT(0.0, vl); + vres3 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+2; // number of values in B +#endif + for (k = temp/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, vl); + + ptrbb += 4; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va2, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va3, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va3, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va2, vl); + + ptrbb += 4; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va4, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va5, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va5, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va4, vl); + + ptrbb += 4; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va6, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va7, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va7, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va6, vl); + + ptrbb += 4; + } + + for (k = temp & 3; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + vres2 = OP_rr(vres2, *(ptrbb + 2), va0, vl); + vres3 = OP_ir(vres3, *(ptrbb + 2), va1, vl); + vres2 = OP_ii(vres2, *(ptrbb + 3), va1, vl); + vres3 = OP_ri(vres3, *(ptrbb + 3), va0, 
vl); + + ptrbb += 4; + } + + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + VSSEG2_FLOAT(C0, vax2, vl); + + va2 = VFMULVF_FLOAT(vres2, alphar, vl); + va3 = VFMULVF_FLOAT(vres3, alphar, vl); + va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl); + va3 = VFMACCVF_FLOAT(va3, alphai, vres2, vl); + + vax2 = VSET_VX2(vax2, 0, va2); + vax2 = VSET_VX2(vax2, 1, va3); + VSSEG2_FLOAT(C1, vax2, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 2; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*2*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + C0 += vl * 2; + C1 += vl * 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; +#endif + bb += (bk << 2); + C += (ldc << 2); + } + + if (bn & 1) + { + C0 = C; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + ptrba = ba; + for (i = bm; i > 0; i -= vl) + { + vl = VSETVL(i); +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + ptrbb = bb; +#else + ptrba += off*vl*2; + ptrbb = bb + off*2; +#endif + + vres0 = VFMVVF_FLOAT(0.0, vl); + vres1 = VFMVVF_FLOAT(0.0, vl); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + temp = bk-off; +#elif defined(LEFT) + temp = off+vl; // number of values in A +#else + temp = off+1; // number of values in B +#endif + for (k = temp/4; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va2 = VGET_VX2(vax2, 0); + va3 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va4 = VGET_VX2(vax2, 0); + va5 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va2, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va3, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va3, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va2, vl); + + ptrbb += 2; + + vax2 = VLSEG2_FLOAT(ptrba, vl); + va6 = VGET_VX2(vax2, 0); + va7 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va4, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va5, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va5, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va4, vl); + + ptrbb += 2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va6, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va7, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va7, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va6, vl); + + ptrbb += 2; + } + + for (k = temp & 3; k > 0; k--) + { + vax2 = VLSEG2_FLOAT(ptrba, vl); + va0 = VGET_VX2(vax2, 0); + va1 = VGET_VX2(vax2, 1); + ptrba += vl*2; + + vres0 = OP_rr(vres0, *(ptrbb + 0), va0, vl); + vres1 = OP_ir(vres1, *(ptrbb + 0), va1, vl); + vres0 = OP_ii(vres0, *(ptrbb + 1), va1, vl); + vres1 = OP_ri(vres1, *(ptrbb + 1), va0, vl); + + ptrbb += 2; + } + + va0 = VFMULVF_FLOAT(vres0, alphar, vl); + va1 = VFMULVF_FLOAT(vres1, alphar, vl); + va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl); + va1 = VFMACCVF_FLOAT(va1, alphai, vres0, vl); + + vax2 = VSET_VX2(vax2, 0, va0); + vax2 = VSET_VX2(vax2, 1, va1); + 
VSSEG2_FLOAT(C0, vax2, vl); + +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + temp = bk - off; +#ifdef LEFT + temp -= vl; // number of values in A +#else + temp -= 1; // number of values in B +#endif + ptrba += temp*vl*2; + ptrbb += temp*2; +#endif + +#ifdef LEFT + off += vl; // number of values in A +#endif + C0 += vl * 2; + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; +#endif + bb += bk << 1; + C += ldc << 1; + } + return 0; +} diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c new file mode 100644 index 000000000..36cec711d --- /dev/null +++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT_VX2_T vax2; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); + + compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); + ao += 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); + } + ao += 2; + b += vl * 2; + i++; + ii++; + } + } + + a += vl * lda * 2; + jj += vl; + } + + return 0; +} diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c new file mode 100644 index 000000000..3a7bdb522 --- /dev/null +++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c @@ -0,0 +1,114 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + + FLOAT_VX2_T vax2; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); + + vax2 = VLSEG2_FLOAT(ao, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); + + b += vl * 2; + ao += lda * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); + } + ao += lda * 2; + b += vl * 2; + i ++; + ii ++; + } + } + + a += vl * 2; + jj += vl; + } + return 0; +} + diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c new file mode 100644 index 000000000..2a158d4de --- /dev/null +++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c @@ -0,0 +1,113 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + BLASLONG stride_lda = sizeof(FLOAT)*lda*2; + + FLOAT *ao; + jj = offset; + + FLOAT_VX2_T vax2; + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + i = 0; + ii = 0; + for (i = 0; i < m;) + { + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1)); + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); + ao += 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii < jj) + { + vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); + VSSEG2_FLOAT(b, vax2, vl); + } + ao += 2; + b += vl * 2; + i++; + ii++; + } + } + + a += vl * lda * 2; + jj += vl; + } + return 0; +} diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c new file mode 100644 index 000000000..4b3319588 --- /dev/null +++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m2(n) +#define FLOAT_VX2_T vfloat32m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e32_v_f32m2x2_m +#define VBOOL_T vbool16_t +#define UINT_V_T vuint32m2_t +#define VID_V_UINT __riscv_vid_v_u32m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u32m2_b16 +#else +#define VSETVL(n) __riscv_vsetvl_e64m2(n) +#define FLOAT_VX2_T vfloat64m2x2_t +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 +#define VSSEG2_FLOAT_M __riscv_vsseg2e64_v_f64m2x2_m +#define VBOOL_T vbool32_t +#define UINT_V_T vuint64m2_t +#define VID_V_UINT __riscv_vid_v_u64m2 +#define VMSLTU_VX_UINT __riscv_vmsltu_vx_u64m2_b32 +#endif + + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + //fprintf(stderr, "%s , %s, m = %4ld n = %4ld lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug + + BLASLONG i, ii, jj, js; + + FLOAT *ao; + + jj = offset; + FLOAT_VX2_T vax2; + + VBOOL_T vbool_cmp; + UINT_V_T vindex; + + size_t vl; + + for (js = n; js > 0; js -= vl) + { + vl = VSETVL(js); + ao = a; + + ii = 0; + for (i = 0; i < m;) + { + + if (ii == jj) + { + vindex = VID_V_UINT(vl); + for (unsigned int j = 0; j < vl; j++) + { + vax2 = VLSEG2_FLOAT(ao, vl); + vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); + VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl); + + compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1)); + + ao += lda * 2; + b += vl * 2; + } + i += vl; + ii += vl; + } + else + { + if (ii > jj) + { + vax2 = VLSEG2_FLOAT(ao, vl); + VSSEG2_FLOAT(b, vax2, vl); + } + ao += lda * 2; + b += vl * 2; + i ++; + ii ++; + } + } + + a += vl * 2; + jj += vl; + } + + return 0; +} diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 66c8a0d2b..bc79c0caf 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -69,16 +69,16 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x ) for( i=0; i FLT_MAX) { + else if (da_r < -FLT_MAX || da_r > FLT_MAX) { while(j < n) { x[i]= NAN; @@ -404,7 +413,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if (x[i] < -FLT_MAX || x[i] > 
FLT_MAX) temp0 = NAN; x[i+1] = da_i * x[i]; - if ( x[i] == x[i]) //preserve NaN + if ( x[i] == x[i]) //preserve NaN x[i] = temp0; i += 2 ; j++; @@ -420,7 +429,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { while(j < n) { - + temp0 = da_r * x[i]; x[i+1] = da_r * x[i+1]; x[i] = temp0; @@ -442,7 +451,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } - } + } } diff --git a/lapack-netlib/SRC/clarfgp.f b/lapack-netlib/SRC/clarfgp.f index 47b5e47b0..980e93612 100644 --- a/lapack-netlib/SRC/clarfgp.f +++ b/lapack-netlib/SRC/clarfgp.f @@ -148,33 +148,23 @@ ALPHR = REAL( ALPHA ) ALPHI = AIMAG( ALPHA ) * - IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN * * H = [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0. * - IF( ALPHI.EQ.ZERO ) THEN - IF( ALPHR.GE.ZERO ) THEN -* When TAU.eq.ZERO, the vector is special-cased to be -* all zeros in the application routines. We do not need -* to clear it. - TAU = ZERO - ELSE -* However, the application routines rely on explicit -* zero checks when TAU.ne.ZERO, and we must clear X. - TAU = TWO - DO J = 1, N-1 - X( 1 + (J-1)*INCX ) = ZERO - END DO - ALPHA = -ALPHA - END IF + IF( ALPHR.GE.ZERO ) THEN +* When TAU.eq.ZERO, the vector is special-cased to be +* all zeros in the application routines. We do not need +* to clear it. + TAU = ZERO ELSE -* Only "reflecting" the diagonal entry to be real and non-negative. - XNORM = SLAPY2( ALPHR, ALPHI ) - TAU = CMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM ) +* However, the application routines rely on explicit +* zero checks when TAU.ne.ZERO, and we must clear X. + TAU = TWO DO J = 1, N-1 X( 1 + (J-1)*INCX ) = ZERO END DO - ALPHA = XNORM + ALPHA = -ALPHA END IF ELSE * diff --git a/lapack-netlib/SRC/zlarfgp.f b/lapack-netlib/SRC/zlarfgp.f index 6c9efb04c..d54f2ea5d 100644 --- a/lapack-netlib/SRC/zlarfgp.f +++ b/lapack-netlib/SRC/zlarfgp.f @@ -148,33 +148,23 @@ ALPHR = DBLE( ALPHA ) ALPHI = DIMAG( ALPHA ) * - IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN + IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN * * H = [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0. * - IF( ALPHI.EQ.ZERO ) THEN - IF( ALPHR.GE.ZERO ) THEN -* When TAU.eq.ZERO, the vector is special-cased to be -* all zeros in the application routines. We do not need -* to clear it. - TAU = ZERO - ELSE -* However, the application routines rely on explicit -* zero checks when TAU.ne.ZERO, and we must clear X. - TAU = TWO - DO J = 1, N-1 - X( 1 + (J-1)*INCX ) = ZERO - END DO - ALPHA = -ALPHA - END IF + IF( ALPHR.GE.ZERO ) THEN +* When TAU.eq.ZERO, the vector is special-cased to be +* all zeros in the application routines. We do not need +* to clear it. + TAU = ZERO ELSE -* Only "reflecting" the diagonal entry to be real and non-negative. - XNORM = DLAPY2( ALPHR, ALPHI ) - TAU = DCMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM ) +* However, the application routines rely on explicit +* zero checks when TAU.ne.ZERO, and we must clear X. + TAU = TWO DO J = 1, N-1 X( 1 + (J-1)*INCX ) = ZERO END DO - ALPHA = XNORM + ALPHA = -ALPHA END IF ELSE * diff --git a/param.h b/param.h index e048dabe7..5d2e960a2 100644 --- a/param.h +++ b/param.h @@ -2743,19 +2743,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL -#ifdef HAVE_MSA -#define SGEMM_DEFAULT_UNROLL_M 8 -#define SGEMM_DEFAULT_UNROLL_N 8 - -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 - -#define CGEMM_DEFAULT_UNROLL_M 8 -#define CGEMM_DEFAULT_UNROLL_N 4 - -#define ZGEMM_DEFAULT_UNROLL_M 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 -#else +#if defined(NO_MSA) #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2767,6 +2755,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 #endif #define SGEMM_DEFAULT_P 64 @@ -2854,12 +2854,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 8 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_P 256 @@ -2891,10 +2891,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_P 128 @@ -2958,19 +2958,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL -#if defined(HAVE_MSA) -#define SGEMM_DEFAULT_UNROLL_M 8 -#define SGEMM_DEFAULT_UNROLL_N 8 - -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 - -#define CGEMM_DEFAULT_UNROLL_M 8 -#define CGEMM_DEFAULT_UNROLL_N 4 - -#define ZGEMM_DEFAULT_UNROLL_M 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 -#else +#if defined(NO_MSA) #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2982,6 +2970,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 #endif #define SGEMM_DEFAULT_P 128 @@ -3041,6 +3041,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(x280) +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 // 4 // 16 // 2 +#define SGEMM_DEFAULT_UNROLL_N 8// 4 // 4 // 2 + +/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N) + * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro. + * If VLMAX size is ever more than 1024, this should be increased also. 
*/ +#define SGEMM_DEFAULT_UNROLL_MN 32 + +#define DGEMM_DEFAULT_UNROLL_M 16 //2 // 8 +#define DGEMM_DEFAULT_UNROLL_N 8 //2 // 4 +#define DGEMM_DEFAULT_UNROLL_MN 32 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 32 + +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 + +#define SGEMM_DEFAULT_P 160 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif #ifdef C910V #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 @@ -3080,6 +3126,84 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef RISCV64_ZVL128B +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif + +#ifdef RISCV64_ZVL256B +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 8 + +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 64 +#define CGEMM_DEFAULT_P 64 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 64 + +#define SGEMM_DEFAULT_R 16384 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 8192 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif + #ifdef ARMV7 #define SNUMOPT 2 #define DNUMOPT 2 diff --git a/test/Makefile b/test/Makefile index 56acf1c5b..5a4694ce6 100644 --- a/test/Makefile +++ b/test/Makefile @@ -276,6 +276,9 @@ ifeq ($(F_COMPILER), IBM) ifeq ($(C_COMPILER), GCC) CEXTRALIB += -lgomp endif +ifeq ($(C_COMPILER), CLANG) +CEXTRALIB += -lomp +endif endif endif diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index cf808b56d..bc74233ab 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -100,6 +100,8 @@ main (int argc, char *argv[]) float C[m * n]; bfloat16_bits AA[m * k], BB[k * n]; float DD[m * n], CC[m * n]; + bfloat16 atmp,btmp; + blasint one=1; for (j = 0; j < m; j++) { @@ -108,16 +110,18 @@ main (int argc, char *argv[]) A[j * k + i] = ((FLOAT) rand () / (FLOAT) 
RAND_MAX) + 0.5; B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; C[j * k + i] = 0; - AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16; - BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16; + sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); + sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); + AA[j * k + i].v = atmp; + BB[j * k + i].v = btmp; CC[j * k + i] = 0; DD[j * k + i] = 0; } } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m); - SBGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, - &m, BB, &k, &beta, CC, &m); + SBGEMM (&transA, &transB, &m, &n, &k, &alpha, (bfloat16*) AA, + &m, (bfloat16*)BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) for (j = 0; j < m; j++) if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 41829bd22..edfcfb7cf 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -17,6 +17,7 @@ else () test_swap.c test_zscal.c test_amin.c + test_axpby.c ) endif () diff --git a/utest/Makefile b/utest/Makefile index 8acaa3ea9..c42496fb3 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -12,7 +12,7 @@ UTESTBIN=openblas_utest include $(TOPDIR)/Makefile.system OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \ - test_amin.o + test_amin.o test_axpby.o #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o ifneq ($(NO_LAPACK), 1) @@ -45,8 +45,18 @@ endif all : run_test +ifeq ($(OSNAME), AIX) +ifeq ($(USE_OPENMP), 1) +$(UTESTBIN): $(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) +else +$(UTESTBIN): $(OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) +endif +else $(UTESTBIN): $(OBJS) $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) +endif run_test: $(UTESTBIN) ifneq ($(CROSS), 1) diff --git a/utest/test_axpby.c b/utest/test_axpby.c new file mode 100644 index 000000000..37ba8ad14 --- /dev/null +++ b/utest/test_axpby.c @@ -0,0 +1,320 @@ +/***************************************************************************** +Copyright (c) 2011-2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "openblas_utest.h" + +#ifdef BUILD_SINGLE +CTEST(axpby, saxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + float alpha = 1.0, beta = 2.0; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 1535.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, saxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + float alpha = 0.25, beta = 0.75; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 1.75, 3.75, 5.75, 7.75, 1.75, 3.75, 5.75, 7.75, 9.75 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} + +CTEST(axpby, saxpby_inc_2) +{ + blasint i; + blasint N = 9, incX = 2, incY = 2; + float alpha = 0.25, beta = 0.75; + float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0, + 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0, + 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + float y2[] = { 1.75, 4.00, 5.75, 8.00, 1.75, 4.00, 5.75, 8.00, + 9.75, 2.00, 3.75, 6.00, 7.75, 2.00, 3.75, 6.00, + 7.75, 10.00 }; + + for(i = 0; i < 2 * N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); + } +} +#endif + +#ifdef BUILD_DOUBLE +CTEST(axpby, daxpby_inc_0) +{ + blasint i; + blasint N = 9, incX = 0, incY = 0; + double alpha = 1.0, beta = 2.0; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y2[] = { 1535.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + +CTEST(axpby, daxpby_inc_1) +{ + blasint i; + blasint N = 9, incX = 1, incY = 1; + double alpha = 0.25, beta = 0.75; + double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 }; + double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 }; + + BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY); + + double x2[] = { 1.0, 3.0, 
+    double y2[] = { 1.75, 3.75, 5.75, 7.75, 1.75, 3.75, 5.75, 7.75, 9.75 };
+
+    for(i = 0; i < N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+    }
+}
+
+CTEST(axpby, daxpby_inc_2)
+{
+    blasint i;
+    blasint N = 9, incX = 2, incY = 2;
+    double alpha = 0.25, beta = 0.75;
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                    2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY);
+
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { 1.75, 4.00, 5.75, 8.00, 1.75, 4.00, 5.75, 8.00,
+                    9.75, 2.00, 3.75, 6.00, 7.75, 2.00, 3.75, 6.00,
+                    7.75, 10.00 };
+
+    for(i = 0; i < 2 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+    }
+}
+#endif
+
+#ifdef BUILD_COMPLEX
+CTEST(axpby, caxpby_inc_0)
+{
+    blasint i;
+    blasint N = 9, incX = 0, incY = 0;
+    float alpha[] = { 1.0, 2.0 }, beta[] = { 2.0, 1.0 };
+    float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+
+    float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y2[] = { 9355.0, -8865.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0,
+                   10.0, 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    for(i = 0; i < 2 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS);
+    }
+}
+
+CTEST(axpby, caxpby_inc_1)
+{
+    blasint i;
+    blasint N = 9, incX = 1, incY = 1;
+    float alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 };
+    float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+
+    float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y2[] = { -2.0, 5.5, -2.0, 13.5, -2.0, 5.5, -2.0, 13.5,
+                   8.0, 11.5, -2.0, 9.5, 6.0, 9.5, -2.0, 9.5, -2.0, 17.5 };
+
+    for(i = 0; i < 2 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS);
+    }
+}
+
+CTEST(axpby, caxpby_inc_2)
+{
+    blasint i;
+    blasint N = 9, incX = 2, incY = 2;
+    float alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 };
+    float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+
+    float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y2[] = { -2.0, 5.5, 6.0, 8.0, -2.0, 5.5, 6.0, 8.0, 8.0,
+                   11.5, 4.0, 6.0, 6.0, 9.5, 4.0, 6.0, -2.0, 17.5,
+                   2.0, 4.0, -2.0, 13.5, 2.0, 4.0, -2.0, 13.5, 10.0,
+                   2.0, -2.0, 9.5, 8.0, 2.0, -2.0, 9.5, 8.0, 10.0 };
+
+    for(i = 0; i < 4 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS);
+    }
+}
+#endif
+
+#ifdef BUILD_COMPLEX16
+CTEST(axpby, zaxpby_inc_0)
+{
+    blasint i;
+    blasint N = 9, incX = 0, incY = 0;
+    double alpha[] = { 1.0, 2.0 }, beta[] = { 2.0, 1.0 };
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                    2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { 9355.0, -8865.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0,
+                    10.0, 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    for(i = 0; i < 2 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+    }
+}
+
+CTEST(axpby, zaxpby_inc_1)
+{
+    blasint i;
+    blasint N = 9, incX = 1, incY = 1;
+    double alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 };
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                    2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { -2.0, 5.5, -2.0, 13.5, -2.0, 5.5, -2.0, 13.5,
+                    8.0, 11.5, -2.0, 9.5, 6.0, 9.5, -2.0, 9.5, -2.0, 17.5 };
+
+    for(i = 0; i < 2 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+    }
+}
+
+CTEST(axpby, zaxpby_inc_2)
+{
+    blasint i;
+    blasint N = 9, incX = 2, incY = 2;
+    double alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 };
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                    2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                    2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                    2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { -2.0, 5.5, 6.0, 8.0, -2.0, 5.5, 6.0, 8.0, 8.0,
+                    11.5, 4.0, 6.0, 6.0, 9.5, 4.0, 6.0, -2.0, 17.5,
+                    2.0, 4.0, -2.0, 13.5, 2.0, 4.0, -2.0, 13.5, 10.0,
+                    2.0, -2.0, 9.5, 8.0, 2.0, -2.0, 9.5, 8.0, 10.0 };
+
+    for(i = 0; i < 4 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+    }
+}
+#endif
diff --git a/utest/test_zscal.c b/utest/test_zscal.c
index 8992eee90..ffc851e8b 100644
--- a/utest/test_zscal.c
+++ b/utest/test_zscal.c
@@ -20,6 +20,18 @@ CTEST(zscal, i_nan)
   ASSERT_TRUE(isnan(nan[17]));
 }
 
+CTEST(zscal, i_nan_inc_2)
+{
+  double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+  double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0,
+                  NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
+  cblas_zscal(9, i, &nan, 2);
+  ASSERT_TRUE(isnan(nan[0]));
+  ASSERT_TRUE(isnan(nan[1]));
+  ASSERT_TRUE(isnan(nan[16]));
+  ASSERT_TRUE(isnan(nan[17]));
+}
+
 CTEST(zscal, nan_i)
 {
   double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
@@ -30,7 +42,19 @@ CTEST(zscal, nan_i)
   ASSERT_TRUE(isnan(i[16]));
   ASSERT_TRUE(isnan(i[17]));
 }
- 
+
+CTEST(zscal, nan_i_inc_2)
+{
+  double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1,
+                0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+  double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
+  cblas_zscal(9, &nan, &i, 2);
+  ASSERT_TRUE(isnan(i[0]));
+  ASSERT_TRUE(isnan(i[1]));
+  ASSERT_TRUE(isnan(i[16]));
+  ASSERT_TRUE(isnan(i[17]));
+}
+
 CTEST(zscal, i_inf)
 {
   double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
@@ -40,7 +64,19 @@ CTEST(zscal, i_inf)
   ASSERT_TRUE(isinf(inf[1]));
   ASSERT_TRUE(isnan(inf[16]));
   ASSERT_TRUE(isinf(inf[17]));
-} 
+}
+
+CTEST(zscal, i_inf_inc_2)
+{
+  double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+  double inf[] = {INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0,
+                  INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
+  cblas_zscal(9, i, &inf, 2);
+  ASSERT_TRUE(isnan(inf[0]));
+  ASSERT_TRUE(isinf(inf[1]));
+  ASSERT_TRUE(isnan(inf[16]));
+  ASSERT_TRUE(isinf(inf[17]));
+}
 
 CTEST(zscal, inf_i)
 {
@@ -53,4 +89,16 @@
   ASSERT_TRUE(isinf(i[17]));
 }
 
+CTEST(zscal, inf_i_inc_2)
+{
+  double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1,
+                0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+  double inf[] = {INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
+  cblas_zscal(9, &inf, &i, 2);
+  ASSERT_TRUE(isnan(i[0]));
+  ASSERT_TRUE(isinf(i[1]));
+  ASSERT_TRUE(isnan(i[16]));
+  ASSERT_TRUE(isinf(i[17]));
+}
+
 #endif