Browse Source

Merge branch 'OpenMathLib:develop' into gemmt_tests

pull/5187/head
Martin Kroeker GitHub 11 months ago
parent
commit
556ffac02d
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
41 changed files with 2158 additions and 992 deletions
  1. +33
    -3
      .github/workflows/c910v.yml
  2. +9
    -2
      CMakeLists.txt
  3. +1
    -0
      Makefile.system
  4. +5
    -5
      cmake/cc.cmake
  5. +1
    -1
      ctest/CMakeLists.txt
  6. +14
    -3
      driver/level3/level3_thread.c
  7. +9
    -6
      interface/gemm.c
  8. +2
    -2
      interface/zscal.c
  9. +2
    -2
      interface/zsyr.c
  10. +40
    -49
      kernel/arm/zscal.c
  11. +1
    -1
      kernel/arm64/dot_kernel_asimd.c
  12. +5
    -2
      kernel/arm64/zscal.S
  13. +332
    -0
      kernel/generic/zgemm_ncopy_16.c
  14. +212
    -562
      kernel/loongarch64/cgemm_ncopy_16_lasx.S
  15. +5
    -4
      kernel/loongarch64/cscal_lasx.S
  16. +58
    -160
      kernel/loongarch64/cscal_lsx.S
  17. +3
    -0
      kernel/loongarch64/zscal.S
  18. +47
    -50
      kernel/mips/zscal.c
  19. +3
    -0
      kernel/mips64/KERNEL
  20. +6
    -0
      kernel/power/zscal.S
  21. +1
    -1
      kernel/power/zscal.c
  22. +5
    -0
      kernel/power/zscal_ppc440.S
  23. +10
    -0
      kernel/riscv64/KERNEL.RISCV64_ZVL256B
  24. +149
    -0
      kernel/riscv64/axpby_vector_v2.c
  25. +123
    -0
      kernel/riscv64/omatcopy_cn_vector.c
  26. +47
    -0
      kernel/riscv64/zaxpy_vector.c
  27. +53
    -1
      kernel/riscv64/zdot_vector.c
  28. +63
    -20
      kernel/riscv64/zgemv_n_vector.c
  29. +106
    -0
      kernel/riscv64/zomatcopy_cn_vector.c
  30. +40
    -49
      kernel/riscv64/zscal.c
  31. +13
    -1
      kernel/riscv64/zscal_rvv.c
  32. +10
    -3
      kernel/riscv64/zscal_vector.c
  33. +5
    -0
      kernel/sparc/KERNEL
  34. +3
    -0
      kernel/x86/KERNEL
  35. +2
    -2
      kernel/x86_64/KERNEL
  36. +52
    -22
      kernel/x86_64/cscal.c
  37. +49
    -19
      kernel/x86_64/zscal.c
  38. +59
    -11
      kernel/zarch/cscal.c
  39. +52
    -11
      kernel/zarch/zscal.c
  40. +474
    -0
      utest/test_gemv.c
  41. +54
    -0
      utest/test_zscal.c

+ 33
- 3
.github/workflows/c910v.yml View File

@@ -83,9 +83,39 @@ jobs:

- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
qemu-riscv64 ./utest/openblas_utest
qemu-riscv64 ./utest/openblas_utest_ext
run_with_retry() {
local cmd="$1"
local time_out=10
local retries=10
local attempt=0

for ((i=1; i<=retries; i++)); do
attempt=$((i))
if timeout -s 12 --preserve-status $time_out $cmd; then
echo "Command succeeded on attempt $i."
return 0
else
local exit_code=$?
if [ $exit_code -eq 140 ]; then
echo "Attempt $i timed out (retrying...)"
time_out=$((time_out + 5))
else
echo "Attempt $i failed with exit code $exit_code. Aborting workflow."
exit $exit_code
fi
fi
done
echo "All $retries attempts failed, giving up."
echo "Final failure was due to timeout."
echo "Aborting workflow."
exit $exit_code
}
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
which qemu-riscv64
export QEMU_BIN=$(which qemu-riscv64)
run_with_retry "$QEMU_BIN ./utest/openblas_utest"
run_with_retry "$QEMU_BIN ./utest/openblas_utest_ext"

OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1


+ 9
- 2
CMakeLists.txt View File

@@ -62,6 +62,7 @@ option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm th

option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF)
option(BUILD_STATIC_LIBS "Build static library" OFF)
option(BUILD_SHARED_LIBS "Build shared library" OFF)
if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS)
set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE)
endif()
@@ -123,7 +124,12 @@ message(WARNING "CMake support is experimental. It does not yet support all buil
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")

set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
string(FIND "${LIBNAMESUFFIX}" "${SUFFIX64_UNDERSCORE}" HAVE64)
if (${HAVE64} GREATER -1)
set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX})
else ()
set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
endif ()

set(BLASDIRS interface driver/level2 driver/level3 driver/others)

@@ -716,4 +722,5 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
install(EXPORT "${PN}${SUFFIX64}Targets"
NAMESPACE "${PN}${SUFFIX64}::"
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
DESTINATION ${CMAKECONFIG_INSTALL_DIR})


+ 1
- 0
Makefile.system View File

@@ -276,6 +276,7 @@ SMALL_MATRIX_OPT = 1
endif
ifeq ($(ARCH), arm64)
GEMM_GEMV_FORWARD = 1
GEMM_GEMV_FORWARD_BF16 = 1
endif
ifeq ($(ARCH), riscv)
GEMM_GEMV_FORWARD = 1


+ 5
- 5
cmake/cc.cmake View File

@@ -229,9 +229,9 @@ if (${CORE} STREQUAL NEOVERSEN1)
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE)
set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-n1")
elseif (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a -mtune=neoverse-n1")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a")
endif()
endif ()
endif ()
@@ -260,13 +260,13 @@ endif ()

if (${CORE} STREQUAL CORTEXA510)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve")
endif ()
endif ()

if (${CORE} STREQUAL CORTEXA710)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve")
endif ()
endif ()

@@ -278,7 +278,7 @@ endif ()

if (${CORE} STREQUAL CORTEXX2)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve")
endif ()
endif ()



+ 1
- 1
ctest/CMakeLists.txt View File

@@ -6,7 +6,7 @@ enable_language(Fortran)
endif()

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_EQUAL 14.2)
if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER 14.1)
list(REMOVE_ITEM ${CMAKE_Fortran_FLAGS} -O3 -O2 -O1 -Os)
set (CMAKE_Fortran_FLAGS_RELEASE "" CACHE STRING "" FORCE)
endif()


+ 14
- 3
driver/level3/level3_thread.c View File

@@ -851,9 +851,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
/* Objective function come from sum of partitions in m and n. */
/* (n / nthreads_n) + (m / nthreads_m) */
/* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m) */
while (nthreads_m % 2 == 0 && n * nthreads_m + m * nthreads_n > n * (nthreads_m / 2) + m * (nthreads_n * 2)) {
nthreads_m /= 2;
nthreads_n *= 2;
BLASLONG cost = 0, div = 0;
BLASLONG i;
for (i = 1; i <= sqrt(nthreads_m); i++) {
if (nthreads_m % i) continue;
BLASLONG j = nthreads_m / i;
BLASLONG cost_i = n * j + m * nthreads_n * i;
BLASLONG cost_j = n * i + m * nthreads_n * j;
if (cost == 0 ||
cost_i < cost) {cost = cost_i; div = i;}
if (cost_j < cost) {cost = cost_j; div = j;}
}
if (div > 1) {
nthreads_m /= div;
nthreads_n *= div;
}
}



+ 9
- 6
interface/gemm.c View File

@@ -417,21 +417,24 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS

PRINT_DEBUG_CNAME;

#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT)
#if defined(DYNAMIC_ARCH) && defined(ARCH_x86)
if (support_avx512() )
#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_avx512() )
#endif
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
#endif
#if defined(DYNAMIC_ARCH) && defined(ARCH_ARM64)
if (support_sme1()){
#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_sme1())
#endif
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
}
#endif
#endif



+ 2
- 2
interface/zscal.c View File

@@ -98,7 +98,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){
if (nthreads == 1) {
#endif

SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 0);
SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 1);

#ifdef SMP
} else {
@@ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){
mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif

blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads);
blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 1, (int (*)(void))SCAL_K, nthreads);

}
#endif


+ 2
- 2
interface/zsyr.c View File

@@ -116,12 +116,12 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,

#else

void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) {
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, void* valpha, FLOAT *x, int incx, FLOAT *a, int lda) {

FLOAT *buffer;
int uplo;
blasint info;
FLOAT * ALPHA = &alpha;
FLOAT * ALPHA = (FLOAT*)valpha;
FLOAT alpha_r = ALPHA[0];
FLOAT alpha_i = ALPHA[1];
#ifdef SMP


+ 40
- 49
kernel/arm/zscal.c View File

@@ -27,65 +27,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"

// The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces.
// In certain cases, the expected return values for cblas_s/zscal differ from those of other upper-level interfaces.
// To handle this, we use the dummy2 parameter to differentiate between them.
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;
BLASLONG i = 0;
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;

if ( (n <= 0) || (inc_x <= 0))
return(0);
if ((n <= 0) || (inc_x <= 0))
return(0);

inc_x2 = 2 * inc_x;
if (dummy2 == 0) {
for (i = 0; i < n; i++)
{
if (da_r == 0.0 && da_i == 0.0)
{
x[ip] = 0.0;
x[ip+1] = 0.0;
}
else
{
temp = da_r * x[ip] - da_i * x[ip+1];
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
x[ip] = temp;
}

inc_x2 = 2 * inc_x;
for ( i=0; i<n; i++ )
{
if ( da_r == 0.0 )
{
if ( da_i == 0.0 )
{
temp = 0.0;
x[ip+1] = 0.0 ;
}
else
{
temp = - da_i * x[ip+1] ;
if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
if (!isinf(x[ip+1]))
x[ip+1] = da_i * x[ip] ;
else x[ip+1] = NAN;
}
}
else
{
if ( da_i == 0.0 )
{
temp = da_r * x[ip] ;
x[ip+1] = da_r * x[ip+1];
}
else
{
temp = da_r * x[ip] - da_i * x[ip+1] ;
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
}
}
x[ip] = temp;
ip += inc_x2;
}
return(0);
}
for (i = 0; i < n; i++)
{
temp = da_r * x[ip] - da_i * x[ip+1];
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;

ip += inc_x2;
}

return(0);
x[ip] = temp;
ip += inc_x2;
}

return(0);
}



+ 1
- 1
kernel/arm64/dot_kernel_asimd.c View File

@@ -134,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
" fadd v4.4s, v4.4s, v6.4s \n" \
" fadd v0.4s, v0.4s, v4.4s \n" \
" faddp v0.4s, v0.4s, v0.4s \n" \
" faddp v0.4s, v0.4s, v0.4s \n"
" faddp "OUT", v0.2s \n"

#else /* !defined(DSDOT) */
#define KERNEL_F1 \


+ 5
- 2
kernel/arm64/zscal.S View File

@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define INC_X x4 /* X stride */
#define I x5 /* loop variable */
#define X_COPY x6 /* Copy of X */
#define FLAG x7
/*******************************************************************************
* Macro definitions
*******************************************************************************/
@@ -216,6 +216,9 @@ zscal_begin:

cmp N, xzr
ble .Lzscal_kernel_L999
ldr FLAG, [sp]
cmp FLAG, #1
beq .Lzscal_kernel_RI_non_zero

fcmp DA_R, #0.0
bne .Lzscal_kernel_R_non_zero
@@ -228,7 +231,7 @@ zscal_begin:
.Lzscal_kernel_R_non_zero:

fcmp DA_I, #0.0
beq .Lzscal_kernel_I_zero
//QUAK beq .Lzscal_kernel_I_zero

/*******************************************************************************
* A_R != 0 && A_I != 0


+ 332
- 0
kernel/generic/zgemm_ncopy_16.c View File

@@ -0,0 +1,332 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
  /* Pack an m x n column-major complex matrix A (leading dimension lda,
     counted in complex elements) into the contiguous buffer B expected by
     the GEMM kernels.  Columns are consumed in panels of 16, then one
     optional panel each of 8, 4, 2 and 1 columns; within a panel the
     complex elements of a single row are stored back to back, row after
     row.  Returns 0 always, matching the other generic copy kernels. */

  BLASLONG i, j, w, width;
  IFLOAT *col[16];   /* per-column read cursors for the current panel */
  IFLOAT *src;       /* first column of the next unprocessed panel */
  IFLOAT *dst;       /* sequential write cursor into B */

  src = a;
  dst = b;
  lda *= 2;          /* complex data: two FLOATs per element */

  for (width = 16; width >= 1; width >>= 1) {
    /* How many panels of this width exist: all full 16-wide panels
       first, then at most one remainder panel per smaller width,
       selected by the corresponding bit of n. */
    BLASLONG panels = (width == 16) ? (n >> 4) : ((n & width) ? 1 : 0);

    for (j = 0; j < panels; j++) {
      for (w = 0; w < width; w++)
        col[w] = src + w * lda;
      src += width * lda;

      for (i = 0; i < m; i++) {
        for (w = 0; w < width; w++) {
          *dst++ = col[w][0];   /* real part      */
          *dst++ = col[w][1];   /* imaginary part */
          col[w] += 2;          /* advance this column to the next row */
        }
      }
    }
  }

  return 0;
}

+ 212
- 562
kernel/loongarch64/cgemm_ncopy_16_lasx.S View File

@@ -45,18 +45,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define S6 $r17
#define S7 $r18
#define S8 $r19
#define S9 $r20
#define S10 $r23
#define S11 $r24
#define S12 $r25
#define S13 $r26
#define S14 $r27
#define S15 $r28
#define S16 $r29
#define TD $r30
#define TS $r31
#define S9 $r23
#define S10 $r24
#define S11 $r25
#define S12 $r26
#define S13 $r27
#define S14 $r28
#define S15 $r29
#define S16 $r30
#define TD $r20
#define TS $r11
#define TL $r7
#define T0 $r6
#define ZERO $r0

#define F0 $f0
@@ -67,6 +66,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define F5 $f5
#define F6 $f6
#define F7 $f7
#define F8 $f8
#define F9 $f9
#define F10 $f10
#define F11 $f11
#define F12 $f12
#define F13 $f13
#define F14 $f14
#define F15 $f15
/* LASX vectors */
#define U0 $xr0
#define U1 $xr1
@@ -103,589 +110,232 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PROLOGUE

addi.d $sp, $sp, -0x90
SDARG $r23, $sp, 0x00
SDARG $r24, $sp, 0x08
SDARG $r25, $sp, 0x10
SDARG $r26, $sp, 0x18
SDARG $r27, $sp, 0x20
SDARG $r28, $sp, 0x28
SDARG $r29, $sp, 0x30
SDARG $r30, $sp, 0x38
SDARG $r31, $sp, 0x40
ST $f23, $sp, 0x48
ST $f24, $sp, 0x50
ST $f25, $sp, 0x58
ST $f26, $sp, 0x60
ST $f27, $sp, 0x68
ST $f28, $sp, 0x70
ST $f29, $sp, 0x78
ST $f30, $sp, 0x80
ST $f31, $sp, 0x88

move TD, DST
move TS, SRC
slli.d TL, LDA, 0x03
slli.d T0, TL, 0x01
srai.d J, N, 0x04
addi.d $sp, $sp, -64
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 24
SDARG $r27, $sp, 32
SDARG $r28, $sp, 40
SDARG $r29, $sp, 48
SDARG $r30, $sp, 56

move TD, DST //boffset
move TS, SRC //aoffset
slli.d TL, LDA, 0x03 //lda
srai.d J, N, 0x04 //j
beq J, ZERO, .L_N8

.L_J1: /* J-- */
.L_J1: /* if(j>0) j--*/
move S1, TS
add.d S2, TS, TL
srai.d I, M, 0x03
move I, M
add.d S3, S2, TL
addi.d J, J, -1
add.d S4, S3, TL
add.d S5, S3, T0
add.d S6, S4, T0
add.d S7, S5, T0
add.d S8, S6, T0
add.d S9, S7, T0
add.d S10, S8, T0
add.d S11, S9, T0
add.d S12, S10, T0
add.d S13, S11, T0
add.d S14, S12, T0
add.d S15, S13, T0
add.d S16, S14, T0
add.d TS, S15, T0
beq I, ZERO, .L_I7

.L_I1: /* I-- */
xvld U0, S1, 0x00
xvld U1, S2, 0x00
xvld U2, S3, 0x00
xvld U3, S4, 0x00
xvld U4, S5, 0x00
xvld U5, S6, 0x00
xvld U6, S7, 0x00
xvld U7, S8, 0x00
xvld U8, S9, 0x00
xvld U9, S10, 0x00
xvld U10, S11, 0x00
xvld U11, S12, 0x00
xvld U12, S13, 0x00
xvld U13, S14, 0x00
xvld U14, S15, 0x00
xvld U15, S16, 0x00

xvpackev.d D0, U1, U0
xvpackod.d D1, U1, U0
xvpackev.d D2, U3, U2
xvpackod.d D3, U3, U2
xvpackev.d D4, U5, U4
xvpackod.d D5, U5, U4
xvpackev.d D6, U7, U6
xvpackod.d D7, U7, U6

xvpackev.d D8, U9, U8
xvpackod.d D9, U9, U8
xvpackev.d D10, U11, U10
xvpackod.d D11, U11, U10
xvpackev.d D12, U13, U12
xvpackod.d D13, U13, U12
xvpackev.d D14, U15, U14
xvpackod.d D15, U15, U14

xvand.v U0, D0, D0
xvpermi.q D0, D2, 0x02 // 0
xvand.v U4, D4, D4
xvpermi.q D4, D6, 0x02 // 1
xvand.v U1, D1, D1
xvpermi.q D1, D3, 0x02 // 4
xvand.v U5, D5, D5
xvpermi.q D5, D7, 0x02 // 5
xvpermi.q D2, U0, 0x31 // 8
xvpermi.q D6, U4, 0x31 // 9
xvpermi.q D3, U1, 0x31 // 12
xvpermi.q D7, U5, 0x31 // 13

xvand.v U8, D8, D8
xvpermi.q D8, D10, 0x02 // 2
xvand.v U12, D12, D12
xvpermi.q D12, D14, 0x02 // 3
xvand.v U9, D9, D9
xvpermi.q D9, D11, 0x02 // 6
xvand.v U13, D13, D13
xvpermi.q D13, D15, 0x02 // 7
xvpermi.q D10, U8, 0x31 // 10
xvpermi.q D14, U12, 0x31 // 11
xvpermi.q D11, U9, 0x31 // 14
xvpermi.q D15, U13, 0x31 // 15

xvst D0, TD, 0x00 // 0
xvst D4, TD, 0x20 // 1
xvst D8, TD, 0x40 // 2
xvst D12, TD, 0x60 // 3
xvst D1, TD, 0x80 // 4
xvst D5, TD, 0xA0 // 5
xvst D9, TD, 0xC0 // 6
xvst D13, TD, 0xE0 // 7
addi.d TD, TD, 0x100
xvst D2, TD, 0x00 // 8
xvst D6, TD, 0x20 // 9
xvst D10, TD, 0x40 // 10
xvst D14, TD, 0x60 // 11
xvst D3, TD, 0x80 // 12
xvst D7, TD, 0xA0 // 13
xvst D11, TD, 0xC0 // 14
xvst D15, TD, 0xE0 // 15
addi.d TD, TD, 0x100

xvld U0, S1, 0x20
xvld U1, S2, 0x20
xvld U2, S3, 0x20
xvld U3, S4, 0x20
xvld U4, S5, 0x20
xvld U5, S6, 0x20
xvld U6, S7, 0x20
xvld U7, S8, 0x20
xvld U8, S9, 0x20
xvld U9, S10, 0x20
xvld U10, S11, 0x20
xvld U11, S12, 0x20
xvld U12, S13, 0x20
xvld U13, S14, 0x20
xvld U14, S15, 0x20
xvld U15, S16, 0x20

xvpackev.d D0, U1, U0
xvpackod.d D1, U1, U0
xvpackev.d D2, U3, U2
xvpackod.d D3, U3, U2
xvpackev.d D4, U5, U4
xvpackod.d D5, U5, U4
xvpackev.d D6, U7, U6
xvpackod.d D7, U7, U6

xvpackev.d D8, U9, U8
xvpackod.d D9, U9, U8
xvpackev.d D10, U11, U10
xvpackod.d D11, U11, U10
xvpackev.d D12, U13, U12
xvpackod.d D13, U13, U12
xvpackev.d D14, U15, U14
xvpackod.d D15, U15, U14

xvand.v U0, D0, D0
xvpermi.q D0, D2, 0x02 // 0
xvand.v U4, D4, D4
xvpermi.q D4, D6, 0x02 // 1
xvand.v U1, D1, D1
xvpermi.q D1, D3, 0x02 // 4
xvand.v U5, D5, D5
xvpermi.q D5, D7, 0x02 // 5
xvpermi.q D2, U0, 0x31 // 8
xvpermi.q D6, U4, 0x31 // 9
xvpermi.q D3, U1, 0x31 // 12
xvpermi.q D7, U5, 0x31 // 13

xvand.v U8, D8, D8
xvpermi.q D8, D10, 0x02 // 2
xvand.v U12, D12, D12
xvpermi.q D12, D14, 0x02 // 3
xvand.v U9, D9, D9
xvpermi.q D9, D11, 0x02 // 6
xvand.v U13, D13, D13
xvpermi.q D13, D15, 0x02 // 7
xvpermi.q D10, U8, 0x31 // 10
xvpermi.q D14, U12, 0x31 // 11
xvpermi.q D11, U9, 0x31 // 14
xvpermi.q D15, U13, 0x31 // 15

xvst D0, TD, 0x00 // 0
xvst D4, TD, 0x20 // 1
xvst D8, TD, 0x40 // 2
xvst D12, TD, 0x60 // 3
xvst D1, TD, 0x80 // 4
xvst D5, TD, 0xA0 // 5
xvst D9, TD, 0xC0 // 6
xvst D13, TD, 0xE0 // 7
addi.d TD, TD, 0x100
xvst D2, TD, 0x00 // 8
xvst D6, TD, 0x20 // 9
xvst D10, TD, 0x40 // 10
xvst D14, TD, 0x60 // 11
xvst D3, TD, 0x80 // 12
xvst D7, TD, 0xA0 // 13
xvst D11, TD, 0xC0 // 14
xvst D15, TD, 0xE0 // 15
addi.d TD, TD, 0x100


addi.d S1, S1, 0x40
addi.d S2, S2, 0x40
addi.d S3, S3, 0x40
addi.d S4, S4, 0x40
addi.d S5, S5, 0x40
addi.d S6, S6, 0x40
addi.d S7, S7, 0x40
addi.d S8, S8, 0x40
addi.d S9, S9, 0x40
addi.d S10, S10, 0x40
addi.d S11, S11, 0x40
addi.d S12, S12, 0x40
addi.d S13, S13, 0x40
addi.d S14, S14, 0x40
addi.d S15, S15, 0x40
addi.d S16, S16, 0x40

add.d S5, S4, TL
add.d S6, S5, TL
add.d S7, S6, TL
add.d S8, S7, TL
add.d S9, S8, TL
add.d S10, S9, TL
add.d S11, S10, TL
add.d S12, S11, TL
add.d S13, S12, TL
add.d S14, S13, TL
add.d S15, S14, TL
add.d S16, S15, TL
add.d TS, S16, TL
beq I, ZERO, .L_J11

.L_I1: /* if(i>0) i--*/
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
fld.d F2, S3, 0x00
fld.d F3, S4, 0x00
fld.d F4, S5, 0x00
fld.d F5, S6, 0x00
fld.d F6, S7, 0x00
fld.d F7, S8, 0x00

fst.d F0, TD, 0x00
fst.d F1, TD, 0x08
fst.d F2, TD, 0x10
fst.d F3, TD, 0x18
fst.d F4, TD, 0x20
fst.d F5, TD, 0x28
fst.d F6, TD, 0x30
fst.d F7, TD, 0x38

fld.d F0, S9, 0x00
fld.d F1, S10, 0x00
fld.d F2, S11, 0x00
fld.d F3, S12, 0x00
fld.d F4, S13, 0x00
fld.d F5, S14, 0x00
fld.d F6, S15, 0x00
fld.d F7, S16, 0x00

fst.d F0, TD, 0x40
fst.d F1, TD, 0x48
fst.d F2, TD, 0x50
fst.d F3, TD, 0x58
fst.d F4, TD, 0x60
fst.d F5, TD, 0x68
fst.d F6, TD, 0x70
fst.d F7, TD, 0x78

addi.d S1, S1, 0x08
addi.d S2, S2, 0x08
addi.d S3, S3, 0x08
addi.d S4, S4, 0x08
addi.d S5, S5, 0x08
addi.d S6, S6, 0x08
addi.d S7, S7, 0x08
addi.d S8, S8, 0x08
addi.d S9, S9, 0x08
addi.d S10, S10, 0x08
addi.d S11, S11, 0x08
addi.d S12, S12, 0x08
addi.d S13, S13, 0x08
addi.d S14, S14, 0x08
addi.d S15, S15, 0x08
addi.d S16, S16, 0x08
addi.d TD, TD, 0x80
addi.d I, I, -1
blt ZERO, I, .L_I1

.L_I7:
andi I, M, 0x07
beq I, ZERO, .L_I0

.L_II1: /* I-- */
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
fld.d F2, S3, 0x00
fld.d F3, S4, 0x00
fld.d F4, S5, 0x00
fld.d F5, S6, 0x00
fld.d F6, S7, 0x00
fld.d F7, S8, 0x00

fst.d F0, TD, 0x00
addi.d S1, S1, 0x08
fst.d F1, TD, 0x08
addi.d S2, S2, 0x08
fst.d F2, TD, 0x10
addi.d S3, S3, 0x08
fst.d F3, TD, 0x18
addi.d S4, S4, 0x08
fst.d F4, TD, 0x20
addi.d S5, S5, 0x08
fst.d F5, TD, 0x28
addi.d S6, S6, 0x08
fst.d F6, TD, 0x30
addi.d S7, S7, 0x08
fst.d F7, TD, 0x38
addi.d S8, S8, 0x08
addi.d TD, TD, 0x40

fld.d F0, S9, 0x00
fld.d F1, S10, 0x00
fld.d F2, S11, 0x00
fld.d F3, S12, 0x00
fld.d F4, S13, 0x00
fld.d F5, S14, 0x00
fld.d F6, S15, 0x00
fld.d F7, S16, 0x00

fst.d F0, TD, 0x00
addi.d S9, S9, 0x08
fst.d F1, TD, 0x08
addi.d S10, S10, 0x08
fst.d F2, TD, 0x10
addi.d S11, S11, 0x08
fst.d F3, TD, 0x18
addi.d S12, S12, 0x08
fst.d F4, TD, 0x20
addi.d S13, S13, 0x08
fst.d F5, TD, 0x28
addi.d S14, S14, 0x08
fst.d F6, TD, 0x30
addi.d S15, S15, 0x08
fst.d F7, TD, 0x38
addi.d S16, S16, 0x08
addi.d TD, TD, 0x40

addi.d I, I, -1
blt ZERO, I, .L_II1

.L_I0:
blt ZERO, J, .L_J1

.L_N8:
andi J, N, 0x08
beq ZERO, J, .L_N4
.L_J11: /* j--*/
addi.d J, J, -1
blt ZERO, J, .L_J1

.L_N8: /* if(n&8)*/
andi I, N, 0x08
beq I, ZERO, .L_N4

move S1, TS
add.d S2, TS, TL
srai.d I, M, 0x03
move I, M
add.d S3, S2, TL
add.d S4, S2, T0
add.d S5, S3, T0
add.d S6, S4, T0
add.d S7, S5, T0
add.d S8, S6, T0
add.d TS, S7, T0
beq I, ZERO, .L_8I3

.L_8I1: /* I-- */
xvld U0, S1, 0x00
xvld U1, S2, 0x00
xvld U2, S3, 0x00
xvld U3, S4, 0x00
xvld U4, S5, 0x00
xvld U5, S6, 0x00
xvld U6, S7, 0x00
xvld U7, S8, 0x00

xvpackev.d D0, U1, U0
xvpackod.d D1, U1, U0
xvpackev.d D2, U3, U2
xvpackod.d D3, U3, U2
xvpackev.d D4, U5, U4
xvpackod.d D5, U5, U4
xvpackev.d D6, U7, U6
xvpackod.d D7, U7, U6

xvand.v U0, D0, D0
xvpermi.q D0, D2, 0x02 // 0
xvand.v U4, D4, D4
xvpermi.q D4, D6, 0x02 // 1
xvand.v U1, D1, D1
xvpermi.q D1, D3, 0x02 // 2
xvand.v U5, D5, D5
xvpermi.q D5, D7, 0x02 // 3
xvpermi.q D2, U0, 0x31 // 4
xvpermi.q D6, U4, 0x31 // 5
xvpermi.q D3, U1, 0x31 // 6
xvpermi.q D7, U5, 0x31 // 7

xvst D0, TD, 0x00
xvst D4, TD, 0x20
xvst D1, TD, 0x40
xvst D5, TD, 0x60
xvst D2, TD, 0x80
xvst D6, TD, 0xA0
xvst D3, TD, 0xC0
xvst D7, TD, 0xE0
addi.d TD, TD, 0x100

xvld U0, S1, 0x20
xvld U1, S2, 0x20
xvld U2, S3, 0x20
xvld U3, S4, 0x20
xvld U4, S5, 0x20
xvld U5, S6, 0x20
xvld U6, S7, 0x20
xvld U7, S8, 0x20

xvpackev.d D0, U1, U0
xvpackod.d D1, U1, U0
xvpackev.d D2, U3, U2
xvpackod.d D3, U3, U2
xvpackev.d D4, U5, U4
xvpackod.d D5, U5, U4
xvpackev.d D6, U7, U6
xvpackod.d D7, U7, U6

xvand.v U0, D0, D0
xvpermi.q D0, D2, 0x02 // 0
xvand.v U4, D4, D4
xvpermi.q D4, D6, 0x02 // 1
xvand.v U1, D1, D1
xvpermi.q D1, D3, 0x02 // 2
xvand.v U5, D5, D5
xvpermi.q D5, D7, 0x02 // 3
xvpermi.q D2, U0, 0x31 // 4
xvpermi.q D6, U4, 0x31 // 5
xvpermi.q D3, U1, 0x31 // 6
xvpermi.q D7, U5, 0x31 // 7

xvst D0, TD, 0x00
xvst D4, TD, 0x20
xvst D1, TD, 0x40
xvst D5, TD, 0x60
xvst D2, TD, 0x80
xvst D6, TD, 0xA0
xvst D3, TD, 0xC0
xvst D7, TD, 0xE0
addi.d TD, TD, 0x100

addi.d S1, S1, 0x40
addi.d S2, S2, 0x40
addi.d S3, S3, 0x40
addi.d S4, S4, 0x40
addi.d S5, S5, 0x40
addi.d S6, S6, 0x40
addi.d S7, S7, 0x40
addi.d S8, S8, 0x40

add.d S4, S3, TL
add.d S5, S4, TL
add.d S6, S5, TL
add.d S7, S6, TL
add.d S8, S7, TL
add.d TS, S8, TL
beq I, ZERO, .L_N4

.L_N81: /* if(i>0) i--*/
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
fld.d F2, S3, 0x00
fld.d F3, S4, 0x00
fld.d F4, S5, 0x00
fld.d F5, S6, 0x00
fld.d F6, S7, 0x00
fld.d F7, S8, 0x00

fst.d F0, TD, 0x00
fst.d F1, TD, 0x08
fst.d F2, TD, 0x10
fst.d F3, TD, 0x18
fst.d F4, TD, 0x20
fst.d F5, TD, 0x28
fst.d F6, TD, 0x30
fst.d F7, TD, 0x38

addi.d S1, S1, 0x08
addi.d S2, S2, 0x08
addi.d S3, S3, 0x08
addi.d S4, S4, 0x08
addi.d S5, S5, 0x08
addi.d S6, S6, 0x08
addi.d S7, S7, 0x08
addi.d S8, S8, 0x08
addi.d TD, TD, 0x40
addi.d I, I, -1
blt ZERO, I, .L_8I1

.L_8I3:
andi I, M, 0x07
beq I, ZERO, .L_N4

.L_8I11:
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
fld.d F2, S3, 0x00
fld.d F3, S4, 0x00
fld.d F4, S5, 0x00
fld.d F5, S6, 0x00
fld.d F6, S7, 0x00
fld.d F7, S8, 0x00

fst.d F0, TD, 0x00
addi.d S1, S1, 0x08
fst.d F1, TD, 0x08
addi.d S2, S2, 0x08
fst.d F2, TD, 0x10
addi.d S3, S3, 0x08
fst.d F3, TD, 0x18
addi.d S4, S4, 0x08
fst.d F4, TD, 0x20
addi.d S5, S5, 0x08
fst.d F5, TD, 0x28
addi.d S6, S6, 0x08
fst.d F6, TD, 0x30
addi.d S7, S7, 0x08
fst.d F7, TD, 0x38
addi.d S8, S8, 0x08

addi.d TD, TD, 0x40
addi.d I, I, -1
blt ZERO, I, .L_8I11

.L_N4:
andi J, N, 0x04
beq ZERO, J, .L_N2
blt ZERO, I, .L_N81

.L_N4: /* if(n&4)*/
andi I, N, 0x04
beq I, ZERO, .L_N2

move S1, TS
add.d S2, TS, TL
srai.d I, M, 0x02
move I, M
add.d S3, S2, TL
add.d S4, S2, T0
add.d TS, S3, T0
beq I, ZERO, .L_I3

.L_4I1: /* I-- */
xvld U0, S1, 0x00
xvld U1, S2, 0x00
xvld U2, S3, 0x00
xvld U3, S4, 0x00

xvpackev.d D0, U1, U0
xvpackod.d D1, U1, U0
xvpackev.d D2, U3, U2
xvpackod.d D3, U3, U2

xvand.v U0, D0, D0
xvpermi.q D0, D2, 0x02 // 0
xvand.v U1, D1, D1
xvpermi.q D1, D3, 0x02 // 1
xvpermi.q D2, U0, 0x31 // 2
xvpermi.q D3, U1, 0x31 // 3

xvst D0, TD, 0x00
xvst D1, TD, 0x20
xvst D2, TD, 0x40
xvst D3, TD, 0x60

addi.d S1, S1, 0x20
addi.d S2, S2, 0x20
addi.d S3, S3, 0x20
addi.d S4, S4, 0x20
addi.d TD, TD, 0x80

add.d S4, S3, TL
add.d TS, S4, TL
beq I, ZERO, .L_N2

.L_N41: /* if(i>0)*/
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
fld.d F2, S3, 0x00
fld.d F3, S4, 0x00

fst.d F0, TD, 0x00
fst.d F1, TD, 0x08
fst.d F2, TD, 0x10
fst.d F3, TD, 0x18

addi.d S1, S1, 0x08
addi.d S2, S2, 0x08
addi.d S3, S3, 0x08
addi.d S4, S4, 0x08
addi.d TD, TD, 0x20
addi.d I, I, -1
blt ZERO, I, .L_4I1

.L_I3:
andi I, M, 0x03
beq I, ZERO, .L_N2

.L_4II1:
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
fld.d F2, S3, 0x00
fld.d F3, S4, 0x00

fst.d F0, TD, 0x00
addi.d S1, S1, 0x08
fst.d F1, TD, 0x08
addi.d S2, S2, 0x08
fst.d F2, TD, 0x10
addi.d S3, S3, 0x08
fst.d F3, TD, 0x18
addi.d S4, S4, 0x08

addi.d TD, TD, 0x20
addi.d I, I, -1
blt ZERO, I, .L_4II1

.L_N2:
andi J, N, 0x02
beq ZERO, J, .L_N1
blt ZERO, I, .L_N41

.L_N2: /* if(n&2)*/
andi I, N, 0x02
beq I, ZERO, .L_N1

move S1, TS
add.d S2, TS, TL
srai.d I, M, 0x01
move I, M
add.d TS, S2, TL
beq I, ZERO, .L_NI1

.L_2I1: /* I-- */
xvld U0, S1, 0x00
xvld U1, S2, 0x00

xvpackev.d D0, U1, U0
xvpackod.d D1, U1, U0

xvpermi.q D0, D1, 0x02 // 0
beq I, ZERO, .L_N1

xvst D0, TD, 0x00
.L_N21: /* if(i>0)*/
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00

addi.d S1, S1, 0x10
addi.d S2, S2, 0x10
addi.d TD, TD, 0x20
fst.d F0, TD, 0x00
fst.d F1, TD, 0x08

addi.d S1, S1, 0x08
addi.d S2, S2, 0x08
addi.d TD, TD, 0x10
addi.d I, I, -1
blt ZERO, I, .L_2I1

.L_NI1:
andi I, M, 0x01
beq I, ZERO, .L_N1

blt ZERO, I, .L_N21

fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
.L_N1: /* if(n&2)*/
andi I, N, 0x01
beq I, ZERO, .L_N0

fst.d F0, TD, 0x00
addi.d S1, S1, 0x08
fst.d F1, TD, 0x08
addi.d S2, S2, 0x08
addi.d TD, TD, 0x10
move S1, TS
move I, M
beq I, ZERO, .L_N0

.L_N1:
move S1, TS
beq ZERO, M, .L_N0
.L_N11: /* if(i>0)*/
fld.d F0, S1, 0x00
fst.d F0, TD, 0x00

.L_M1:
fld.d F0, S1, 0x00
addi.d S1, S1, 0x08
fst.d F0, TD, 0x00
addi.d TD, TD, 0x08
addi.d M, M, -1
blt ZERO, M, .L_M1
addi.d S1, S1, 0x08
addi.d TD, TD, 0x08
addi.d I, I, -1
blt ZERO, I, .L_N11

.L_N0:
LDARG $r23, $sp, 0x00
LDARG $r24, $sp, 0x08
LDARG $r25, $sp, 0x10
LDARG $r26, $sp, 0x18
LDARG $r27, $sp, 0x20
LDARG $r28, $sp, 0x28
LDARG $r29, $sp, 0x30
LDARG $r30, $sp, 0x38
LDARG $r31, $sp, 0x40
LD $f23, $sp, 0x48
LD $f24, $sp, 0x50
LD $f25, $sp, 0x58
LD $f26, $sp, 0x60
LD $f27, $sp, 0x68
LD $f28, $sp, 0x70
LD $f29, $sp, 0x78
LD $f30, $sp, 0x80
LD $f31, $sp, 0x88
addi.d $sp, $sp, 0x90
jirl $r0, $r1, 0x00
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
LDARG $r25, $sp, 16
LDARG $r26, $sp, 24
LDARG $r27, $sp, 32
LDARG $r28, $sp, 40
LDARG $r29, $sp, 48
LDARG $r30, $sp, 56
addi.d $sp, $sp, 64
jirl $r0, $r1, 0x00

EPILOGUE

+ 5
- 4
kernel/loongarch64/cscal_lasx.S View File

@@ -94,7 +94,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
CMPEQ $fcc1, ALPHAI, a1
bge $r0, I, .L19
/////// INCX == 1 && N >= 4 ////////
bnez DUMMY2, .L17 // if DUMMPY2 == 1, called from c/zscal.
bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal.

bceqz $fcc0, .L17

@@ -146,6 +146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d I, I, -1
blt $r0, I, .L17
b .L19

.align 3

/////// INCX == 1 && N < 8 ///////
@@ -156,7 +157,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
andi I, N, 7
#endif
beqz I, .L999
bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal.
bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.

bceqz $fcc0, .L998

@@ -171,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
CMPEQ $fcc1, ALPHAI, a1
move XX, X
bge $r0, I, .L29
bnez DUMMY2, .L25 // if DUMMPY2 == 1, called from c/zscal.
bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal.
bceqz $fcc0, .L25

bceqz $fcc1, .L25
@@ -341,7 +342,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
andi I, N, 7
#endif
beqz I, .L999
bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal.
bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.

bceqz $fcc0, .L998



+ 58
- 160
kernel/loongarch64/cscal_lsx.S View File

@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHAI $f1
#define X $r7
#define INCX $r8
#define DUMMY2 $r9

#define I $r12
#define TEMP $r13
@@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

bge $r0, N, .L999
bge $r0, INCX, .L999
ld.d DUMMY2, $sp, 0
li.d TEMP, 1
movgr2fr.d a1, $r0
FFINT a1, a1
@@ -84,24 +86,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
srai.d I, N, 2
bne INCX, TEMP, .L22

/////// INCX == 1 ////////
.L11:
bge $r0, I, .L997
CMPEQ $fcc0, ALPHAR, a1
CMPEQ $fcc1, ALPHAI, a1
bceqz $fcc0, .L13
b .L14
.align 3
bge $r0, I, .L19

.L13:
bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
b .L113 //alpha_r != 0.0 && alpha_i == 0.0
/////// INCX == 1 && N >= 4 ////////
bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal.

.L14:
bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0
b .L111 //alpha_r == 0.0 && alpha_i == 0.0
.align 3
bceqz $fcc0, .L17

.L111: //alpha_r == 0.0 && alpha_i == 0.0
bceqz $fcc1, .L17

.L15: //alpha_r == 0.0 && alpha_i == 0.0
vst VXZ, X, 0 * SIZE
#ifdef DOUBLE
vst VXZ, X, 2 * SIZE
@@ -112,50 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L111
b .L997
.align 3

.L113: //alpha_r != 0.0 && alpha_i == 0.0
vld VX0, X, 0 * SIZE
#ifdef DOUBLE
vld VX1, X, 2 * SIZE
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
vfmul.d x3, VXAR, x1
vfmul.d x4, VXAR, x2
vilvl.d VX2, x4 ,x3
vilvh.d VX3, x4, x3
vst VX2, X, 0 * SIZE
vst VX3, X, 2 * SIZE
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
vfmul.d x3, VXAR, x1
vfmul.d x4, VXAR, x2
vilvl.d VX2, x4 ,x3
vilvh.d VX3, x4, x3
vst VX2, X, 4 * SIZE
vst VX3, X, 6 * SIZE
#else
vld VX1, X, 4 * SIZE
vpickev.w x1, VX1, VX0
vpickod.w x2, VX1, VX0
vfmul.s x3, VXAR, x1
vfmul.s x4, VXAR, x2
vilvl.w VX2, x4 ,x3
vilvh.w VX3, x4, x3
vst VX2, X, 0 * SIZE
vst VX3, X, 4 * SIZE
#endif
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L113
b .L997
blt $r0, I, .L15
b .L19
.align 3

.L114: //alpha_r != 0.0 && alpha_i != 0.0
.L17:
vld VX0, X, 0 * SIZE
#ifdef DOUBLE
vld VX1, X, 2 * SIZE
@@ -196,29 +155,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L114
b .L997
blt $r0, I, .L17
b .L19
.align 3

/////// INCX == 1 && N < 8 ///////
.L19:
andi I, N, 3
beqz I, .L999
bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.

bceqz $fcc0, .L998

bceqz $fcc1, .L998

b .L995 // alpha_r == 0.0 && alpha_i == 0.0

/////// INCX != 1 ////////
.L22:
bge $r0, I, .L997
move XX, X
CMPEQ $fcc0, ALPHAR, a1
CMPEQ $fcc1, ALPHAI, a1
bceqz $fcc0, .L23
b .L24
.align 3
move XX, X
bge $r0, I, .L29
bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal.

.L23:
bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
b .L223 //alpha_r != 0.0 && alpha_i == 0.0
bceqz $fcc0, .L25

.L24:
bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0
b .L221 //alpha_r == 0.0 && alpha_i == 0.0
.align 3
bceqz $fcc1, .L25

.L221: //alpha_r == 0.0 && alpha_i == 0.0
.L27: //alpha_r == 0.0 && alpha_i == 0.0
#ifdef DOUBLE
vstelm.d VXZ, X, 0, 0
vstelm.d VXZ, X, 1 * SIZE, 0
@@ -246,92 +211,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
add.d X, X, INCX
addi.d I, I, -1
blt $r0, I, .L221
b .L997
blt $r0, I, .L27
b .L29
.align 3

.L223: //alpha_r != 0.0 && alpha_i == 0.0
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
vfmul.d x3, VXAR, x1
vfmul.d x4, VXAR, x2
vstelm.d x3, XX, 0 * SIZE, 0
vstelm.d x4, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.d x3, XX, 0 * SIZE, 1
vstelm.d x4, XX, 1 * SIZE, 1
add.d XX, XX, INCX

ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
add.d X, X, INCX
vfmul.d x3, VXAR, x1
vfmul.d x4, VXAR, x2
addi.d I, I, -1
vstelm.d x3, XX, 0 * SIZE, 0
vstelm.d x4, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.d x3, XX, 0 * SIZE, 1
vstelm.d x4, XX, 1 * SIZE, 1
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 0
vinsgr2vr.w x2, t2, 0
vinsgr2vr.w x1, t3, 1
vinsgr2vr.w x2, t4, 1
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
vinsgr2vr.w x1, t1, 2
vinsgr2vr.w x2, t2, 2
vinsgr2vr.w x1, t3, 3
vinsgr2vr.w x2, t4, 3
add.d X, X, INCX

vfmul.s x3, VXAR, x1
vfmul.s x4, VXAR, x2
addi.d I, I, -1
vstelm.w x3, XX, 0 * SIZE, 0
vstelm.w x4, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.w x3, XX, 0 * SIZE, 1
vstelm.w x4, XX, 1 * SIZE, 1
add.d XX, XX, INCX
vstelm.w x3, XX, 0 * SIZE, 2
vstelm.w x4, XX, 1 * SIZE, 2
add.d XX, XX, INCX
vstelm.w x3, XX, 0 * SIZE, 3
vstelm.w x4, XX, 1 * SIZE, 3
#endif
add.d XX, XX, INCX
blt $r0, I, .L223
b .L997
.align 3

.L224: //alpha_r != 0.0 && alpha_i != 0.0
.L25:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
@@ -414,15 +298,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vstelm.w x4, XX, 1 * SIZE, 3
#endif
add.d XX, XX, INCX
blt $r0, I, .L224
b .L997
blt $r0, I, .L25
b .L29
.align 3

.L997:
andi I, N, 3
bge $r0, I, .L999
.align 3
/////// INCX != 1 && N < 8 ///////
.L29:
andi I, N, 3
beqz I, .L999
bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.

bceqz $fcc0, .L998

bceqz $fcc1, .L998

b .L995 // alpha_r == 0.0 && alpha_i == 0.0

.L995: // alpha_r == 0.0 && alpha_i == 0.0
ST a1, X, 0 * SIZE
ST a1, X, 1 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L995
b .L999
.L998:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
@@ -435,7 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ST s2, X, 1 * SIZE
add.d X, X, INCX
blt $r0, I, .L998
.align 3
b .L999

.L999:
move $r4, $r12


+ 3
- 0
kernel/loongarch64/zscal.S View File

@@ -53,6 +53,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE

li.d TEMP, 2 * SIZE
ld.d XX, $sp, 0 // Load dummy2
slli.d XX, XX, ZBASE_SHIFT
MTC a1, $r0
slli.d INCX, INCX, ZBASE_SHIFT
bge $r0, N, .L999
@@ -60,6 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
CMPEQ $fcc1, ALPHA_I, a1
bceqz $fcc0, .L50
bceqz $fcc1, .L50
beq XX, TEMP, .L50 // if dummy2 == 1, do not directly copy 0
srai.d I, N, 2
bne INCX, TEMP, .L20
bge $r0, I, .L15


+ 47
- 50
kernel/mips/zscal.c View File

@@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -25,61 +25,58 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"

// The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces.
// In certain cases, the expected return values for cblas_s/zscal differ from those of other upper-level interfaces.
// To handle this, we use the dummy2 parameter to differentiate between them.
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;
BLASLONG i = 0;
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;

inc_x2 = 2 * inc_x;
for ( i=0; i<n; i++ )
{
if ( da_r == 0.0 )
{
if ( da_i == 0.0 )
{
temp = 0.0;
x[ip+1] = 0.0 ;
}
else
{
temp = - da_i * x[ip+1] ;
if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
if (!isinf(x[ip+1]))
x[ip+1] = da_i * x[ip] ;
else x[ip+1] = NAN;
}
}
else
{
if ( da_i == 0.0 )
{
temp = da_r * x[ip] ;
if (!isinf(x[ip+1]))
x[ip+1] = da_r * x[ip+1];
else x[ip+1] = NAN;
}
else
{
temp = da_r * x[ip] - da_i * x[ip+1] ;
if (!isinf(x[ip+1]))
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
else x[ip+1] = NAN;
}
}
if ( da_r != da_r )
x[ip] = da_r;
else
x[ip] = temp;
ip += inc_x2;
}
if ((n <= 0) || (inc_x <= 0))
return(0);

return(0);
inc_x2 = 2 * inc_x;
if (dummy2 == 0) {
for (i = 0; i < n; i++)
{
if (da_r == 0.0 && da_i == 0.0)
{
x[ip] = 0.0;
x[ip+1] = 0.0;
}
else
{
temp = da_r * x[ip] - da_i * x[ip+1];
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
x[ip] = temp;
}

}
ip += inc_x2;
}
return(0);
}
for (i = 0; i < n; i++)
{
temp = da_r * x[ip] - da_i * x[ip+1];
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;

x[ip] = temp;
ip += inc_x2;
}

return(0);
}

+ 3
- 0
kernel/mips64/KERNEL View File

@@ -6,6 +6,9 @@ CROTKERNEL = ../mips/zrot.c
ZROTKERNEL = ../mips/zrot.c
CSWAPKERNEL = ../mips/zswap.c
ZSWAPKERNEL = ../mips/zswap.c

CSCALKERNEL = ../mips/zscal.c
ZSCALKERNEL = ../mips/zscal.c
ifndef SNRM2KERNEL


+ 6
- 0
kernel/power/zscal.S View File

@@ -51,6 +51,7 @@
#define X r8
#define INCX r9
#endif
#define FLAG r11
#endif

#if defined(_AIX) || defined(__APPLE__)
@@ -61,6 +62,7 @@
#define X r8
#define INCX r9
#endif
#define FLAG r11
#endif

#define FZERO f0
@@ -94,6 +96,10 @@
fcmpu cr0, FZERO, ALPHA_I
bne- cr0, LL(A1I1)

LDLONG FLAG, 104(SP)
cmpwi cr0, FLAG, 1
beq- cr0, LL(A1I1)

cmpwi cr0, INCX, 2 * SIZE
bne- cr0, LL(A0IN)



+ 1
- 1
kernel/power/zscal.c View File

@@ -136,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
if ( inc_x <= 0 )
return(0);

if (da_r == ZERO && da_i == ZERO) {
if (da_r == ZERO && da_i == ZERO && dummy2 == 0) {
//clear the vector and return
if (inc_x == 1) {
memset(x, 0, n*COMPSIZE*SIZE);


+ 5
- 0
kernel/power/zscal_ppc440.S View File

@@ -64,6 +64,7 @@
#endif

#define INC1 r11
#define FLAG r12

#define FZERO f0
#define ALPHA_R f1
@@ -97,6 +98,10 @@
fcmpu cr0, FZERO, ALPHA_I
bne- cr0, LL(A1I1)

lwz FLAG, FRAMESLOT(0)(SP)
cmpwi cr0, FLAG, 1
beq- cr0, LL(A1I1)

LL(A0IN):
srawi. r0, N, 3
mtspr CTR, r0


+ 10
- 0
kernel/riscv64/KERNEL.RISCV64_ZVL256B View File

@@ -169,6 +169,7 @@ SSYMV_U_KERNEL = symv_U_vector.c
SSYMV_L_KERNEL = symv_L_vector.c
DSYMV_U_KERNEL = symv_U_vector.c
DSYMV_L_KERNEL = symv_L_vector.c

CSYMV_U_KERNEL = ../generic/zsymv_k.c
CSYMV_L_KERNEL = ../generic/zsymv_k.c
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
@@ -201,3 +202,12 @@ endif
ifndef ZGEMM_BETA
ZGEMM_BETA = ../generic/zgemm_beta.c
endif

ZOMATCOPY_CN = zomatcopy_cn_vector.c
COMATCOPY_CN = zomatcopy_cn_vector.c

DOMATCOPY_CN = omatcopy_cn_vector.c
SOMATCOPY_CN = omatcopy_cn_vector.c

SAXPBYKERNEL = axpby_vector_v2.c
DAXPBYKERNEL = axpby_vector_v2.c

+ 149
- 0
kernel/riscv64/axpby_vector_v2.c View File

@@ -0,0 +1,149 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m8)
#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m8)
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m8)
#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m8)
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
#else
#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
#endif

// y := beta * y + alpha * x  (real AXPBY) over n elements, vectorized with
// RISC-V RVV intrinsics (see the VSETVL/VLEV/VLSEV/VSEV/VSSEV macros above;
// e32m8 for float32, e64m4 for float64).
//
//   n       number of elements; returns immediately if n <= 0
//   alpha   scale applied to x
//   x/inc_x input vector and its element stride
//   beta    scale applied to y
//   y/inc_y in/out vector and its element stride
//
// Four vector paths are selected from the strides (unit/unit, unit-x,
// unit-y, both non-unit), plus a scalar path for inc_y == 0 where every
// update aliases y[0] and must be applied sequentially.
int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
{
FLOAT_V_T vx, vy;
unsigned int gvl;  // granted vector length: elements handled per iteration
if (n <= 0)
return (0);
if (inc_x == 1 && inc_y == 1)
{
// Both vectors contiguous: unit-stride loads/stores.
while (n > 0)
{
gvl = VSETVL(n);

vx = VLEV_FLOAT(x, gvl);
vy = VLEV_FLOAT(y, gvl);

// vy = beta*vy, then vy += alpha*vx (fused multiply-accumulate).
vy = VFMULVF_FLOAT(vy, beta, gvl);
vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl);

VSEV_FLOAT(y, vy, gvl);

x += gvl;
y += gvl;
n -= gvl;
}
}
else if (1 == inc_x)
{
// x contiguous, y strided; strided accesses take a byte stride.
BLASLONG stride_y = inc_y * sizeof(FLOAT);
while (n > 0)
{
gvl = VSETVL(n);
vy = VLSEV_FLOAT(y, stride_y, gvl);
vx = VLEV_FLOAT(x, gvl);

vy = VFMULVF_FLOAT(vy, beta, gvl);
vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl);

VSSEV_FLOAT(y, stride_y, vy, gvl);

x += gvl;
y += gvl * inc_y;
n -= gvl;
}
}
else if (1 == inc_y)
{
// y contiguous, x strided.
BLASLONG stride_x = inc_x * sizeof(FLOAT);

while (n > 0)
{
gvl = VSETVL(n);

vx = VLSEV_FLOAT(x, stride_x, gvl);
vy = VLEV_FLOAT(y, gvl);

vy = VFMULVF_FLOAT(vy, beta, gvl);
vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl);

VSEV_FLOAT(y, vy, gvl);

x += gvl * inc_x;
y += gvl;
n -= gvl;
}
}
else if (inc_y == 0)
{
// All n updates land on y[0]; apply them one by one in scalar code so
// the final value matches the sequential reference semantics.
FLOAT vf = y[0];
for (; n > 0; n--)
{
vf = (vf * beta) + (x[0] * alpha);
x += inc_x;
}
y[0] = vf;
}
else
{
// General case: neither stride is 1 and inc_y != 0; both sides use
// strided vector loads/stores.
BLASLONG stride_x = inc_x * sizeof(FLOAT);
BLASLONG stride_y = inc_y * sizeof(FLOAT);
while (n > 0)
{
gvl = VSETVL(n);
vy = VLSEV_FLOAT(y, stride_y, gvl);
vx = VLSEV_FLOAT(x, stride_x, gvl);

vy = VFMULVF_FLOAT(vy, beta, gvl);
vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl);

VSSEV_FLOAT(y, stride_y, vy, gvl);

x += gvl * inc_x;
y += gvl * inc_y;
n -= gvl;
}
}

return (0);
}

+ 123
- 0
kernel/riscv64/omatcopy_cn_vector.c View File

@@ -0,0 +1,123 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#if !defined(DOUBLE)
#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m4)()
#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4)
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
#else
#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m4)()
#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
#endif


// B := alpha * A, element-wise matrix copy-and-scale (the "_cn" kernel:
// column-major layout, no transpose, per its registration in
// KERNEL.RISCV64_ZVL256B). Each column is processed with RVV vectors
// (see the VSETVL/VLEV/VSEV/VFMULVF macros above); lda/ldb are the
// leading dimensions of A and B.
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
BLASLONG i,j;
FLOAT *aptr,*bptr;
size_t vl;  // granted vector length; assigned by VSETVL inside each loop body before the j+=vl increment runs

FLOAT_V_T va, vb,va1,vb1;  // NOTE(review): vb and vb1 are declared but never used
if ( rows <= 0 ) return(0);
if ( cols <= 0 ) return(0);

aptr = a;
bptr = b;

if ( alpha == 0.0 )
{
// alpha == 0: fill B with zeros; A is never read.
vl = VSETVL_MAX;
va = VFMVVF_FLOAT(0, vl);
for ( i=0; i<cols ; i++ )
{
for(j=0; j<rows; j+=vl)
{
vl = VSETVL(rows - j);
VSEV_FLOAT(bptr + j, va, vl);
}
bptr += ldb;
}
return(0);
}

if ( alpha == 1.0 )
{
// alpha == 1: straight column-by-column copy, no multiply needed.
for ( i=0; i<cols ; i++ )
{
for(j=0; j<rows; j+=vl)
{
vl = VSETVL(rows - j);
va = VLEV_FLOAT(aptr + j, vl);
VSEV_FLOAT(bptr + j, va, vl);
}
aptr += lda;
bptr += ldb;
}
return(0);
}
// General alpha: handle one leading column when cols is odd, then
// process two columns per iteration (independent loads/multiplies/stores
// interleaved within the row loop).
i = 0;
if( cols % 2 ){
for(j=0; j<rows; j+=vl)
{
vl = VSETVL(rows - j);
va = VLEV_FLOAT(aptr + j, vl);
va = VFMULVF_FLOAT(va, alpha, vl);
VSEV_FLOAT(bptr + j, va, vl);
}
aptr += lda;
bptr += ldb;
i = 1;
}
for ( ; i<cols ; i+=2 )
{
for(j=0; j<rows; j+=vl)
{
vl = VSETVL(rows - j);
va = VLEV_FLOAT(aptr + j, vl);
va1= VLEV_FLOAT(aptr + lda + j, vl);
va = VFMULVF_FLOAT(va, alpha, vl);
va1= VFMULVF_FLOAT(va1, alpha, vl);
VSEV_FLOAT(bptr + j, va, vl);
VSEV_FLOAT(bptr + ldb + j, va1, vl);
}
aptr += 2 * lda;
bptr += 2 * ldb;
}

return(0);
}

+ 47
- 0
kernel/riscv64/zaxpy_vector.c View File

@@ -43,8 +43,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
#endif

#if !defined(DOUBLE)
inline int small_caxpy_kernel(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
#else
inline int small_zaxpy_kernel(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
#endif
{
    /* Scalar complex AXPY fallback used for short vectors:
     *   y[k] += (da_r + i*da_i) * x[k]        (non-CONJ build)
     *   y[k] += conj-style accumulation       (CONJ build)
     * Elements are interleaved (re, im) pairs, so the walk advances by
     * twice the declared increments. Returns 0 always; a zero alpha or
     * non-positive n is a no-op. */
    BLASLONG k;
    BLASLONG xi = 0, yi = 0;
    const BLASLONG xstep = 2 * inc_x;
    const BLASLONG ystep = 2 * inc_y;

    if (n <= 0) return(0);
    if (da_r == 0.0 && da_i == 0.0) return(0);

    for (k = 0; k < n; k++)
    {
#if !defined(CONJ)
        y[yi]     += ( da_r * x[xi]     - da_i * x[xi + 1] );
        y[yi + 1] += ( da_r * x[xi + 1] + da_i * x[xi] );
#else
        y[yi]     += ( da_r * x[xi]     + da_i * x[xi + 1] );
        y[yi + 1] -= ( da_r * x[xi + 1] - da_i * x[xi] );
#endif
        xi += xstep;
        yi += ystep;
    }
    return(0);
}

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
#if !defined(DOUBLE)
if(n < 16) {
return small_caxpy_kernel(n, dummy0, dummy1, da_r, da_i, x, inc_x, y, inc_y, dummy, dummy2);
}
#else
if(n < 8) {
return small_zaxpy_kernel(n, dummy0, dummy1, da_r, da_i, x, inc_x, y, inc_y, dummy, dummy2);
}
#endif
BLASLONG i = 0, j = 0;
BLASLONG ix = 0,iy = 0;
if(n <= 0) return(0);


+ 53
- 1
kernel/riscv64/zdot_vector.c View File

@@ -68,8 +68,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4)
#endif

#if !defined(DOUBLE)
inline OPENBLAS_COMPLEX_FLOAT small_cdot_kernel(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
inline OPENBLAS_COMPLEX_FLOAT small_zdot_kernel(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#endif
{
    /* Scalar complex dot-product fallback for short vectors.
     * Accumulates sum(x[k] * y[k]) (CONJ build conjugates x) over n
     * interleaved (re, im) pairs and returns it as a complex value.
     * n < 1 yields 0 + 0i. */
    OPENBLAS_COMPLEX_FLOAT result;
    FLOAT acc_r = 0.0;
    FLOAT acc_i = 0.0;
    BLASLONG k, xi, yi;
    const BLASLONG xstep = 2 * inc_x;
    const BLASLONG ystep = 2 * inc_y;

    CREAL(result) = 0.0;
    CIMAG(result) = 0.0;

    if (n < 1) return(result);

    for (k = 0, xi = 0, yi = 0; k < n; k++, xi += xstep, yi += ystep)
    {
#if !defined(CONJ)
        acc_r += ( x[xi] * y[yi]     - x[xi + 1] * y[yi + 1] );
        acc_i += ( x[xi + 1] * y[yi] + x[xi]     * y[yi + 1] );
#else
        acc_r += ( x[xi] * y[yi]     + x[xi + 1] * y[yi + 1] );
        acc_i -= ( x[xi + 1] * y[yi] - x[xi]     * y[yi + 1] );
#endif
    }

    CREAL(result) = acc_r;
    CIMAG(result) = acc_i;
    return(result);
}
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
#if !defined(DOUBLE)
if(n < 16) {
return small_cdot_kernel(n, x, inc_x, y, inc_y);
}
#else
if(n < 8) {
return small_zdot_kernel(n, x, inc_x, y, inc_y);
}
#endif
BLASLONG i=0, j=0;
BLASLONG ix=0,iy=0;
FLOAT dot[2];
@@ -148,4 +200,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
CREAL(result) = dot[0];
CIMAG(result) = dot[1];
return(result);
}
}

+ 63
- 20
kernel/riscv64/zgemv_n_vector.c View File

@@ -66,7 +66,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
BLASLONG lda2 = lda * 2;
vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl);
vy1_new = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl);
for (k = 0, j = 0; k < m / gvl; k++)
for (k = 0, j = 0; k < m / gvl; k ++)
{
a_ptr = a;
ix = 0;
@@ -121,30 +121,73 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
#endif
a_ptr += lda2;
ix += inc_x2;
}

for (; i < n; i += 4)
for (i = n % 4 ; i < n; i += 4)
{
#if !defined(XCONJ)

x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4);
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4);
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4);
temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 4);
temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 4);
VSEV_FLOAT(&temp_rr[0], temp_rv, 4);
VSEV_FLOAT(&temp_ii[0], temp_iv, 4);
// temp_rr[0] = alpha_r * x[ix] - alpha_i * x[ix + 1];
// temp_rr[1] = alpha_r * x[ix + inc_x2] - alpha_i * x[ix + inc_x2 + 1];
x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 2);
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 2);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 2);

// temp_ii[0] = alpha_r * x[ix + 1] + alpha_i * x[ix];
// temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] + alpha_i * x[ix + inc_x2];
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 2);
temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 2);
VSEV_FLOAT(&temp_rr[0], temp_rv, 2);
VSEV_FLOAT(&temp_ii[0], temp_iv, 2);
// temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] - alpha_i * x[ix + inc_x2 * 2 + 1];
// temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] - alpha_i * x[ix + inc_x2 * 3 + 1];
x_v0 = VLSEV_FLOAT(&x[ix + inc_x2 * 2], inc_x2 * sizeof(FLOAT), 2);
x_v1 = VLSEV_FLOAT(&x[ix + inc_x2 * 2 + 1], inc_x2 * sizeof(FLOAT), 2);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 2);

// temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] + alpha_i * x[ix + inc_x2 * 2];
// temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] + alpha_i * x[ix + inc_x2 * 3];
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 2);
temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 2);
VSEV_FLOAT(&temp_rr[2], temp_rv, 2);
VSEV_FLOAT(&temp_ii[2], temp_iv, 2);

#else
x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4);
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4);
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4);
temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 4);
temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_r, x_v1, 4);
VSEV_FLOAT(&temp_rr[0], temp_rv, 4);
VSEV_FLOAT(&temp_ii[0], temp_iv, 4);
// temp_rr[0] = alpha_r * x[ix] + alpha_i * x[ix + 1];
// temp_rr[1] = alpha_r * x[ix + inc_x2] + alpha_i * x[ix + inc_x2 + 1];
x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 2);
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 2);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 2);


// temp_ii[0] = alpha_r * x[ix + 1] - alpha_i * x[ix];
// temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] - alpha_i * x[ix + inc_x2];
temp_iv = VFMUL_VF_FLOAT(x_v1, alpha_r, 2);
temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_i, x_v0, 2);
VSEV_FLOAT(&temp_rr[0], temp_rv, 2);
VSEV_FLOAT(&temp_ii[0], temp_iv, 2);

// temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] + alpha_i * x[ix + inc_x2 * 2 + 1];
// temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] + alpha_i * x[ix + inc_x2 * 3 + 1];
x_v0 = VLSEV_FLOAT(&x[ix + inc_x2 * 2], inc_x2 * sizeof(FLOAT), 2);
x_v1 = VLSEV_FLOAT(&x[ix + inc_x2 * 2 + 1], inc_x2 * sizeof(FLOAT), 2);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 2);


temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] - alpha_i * x[ix + inc_x2 * 2];
temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] - alpha_i * x[ix + inc_x2 * 3];
temp_iv = VFMUL_VF_FLOAT(x_v1, alpha_r, 2);
temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_i, x_v0, 2);
VSEV_FLOAT(&temp_rr[2], temp_rv, 2);
VSEV_FLOAT(&temp_ii[2], temp_iv, 2);



#endif

@@ -257,7 +300,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl);
j += gvl * 2;
iy += inc_yv;
iy += inc_yv ;
}
// tail
if (j / 2 < m)


+ 106
- 0
kernel/riscv64/zomatcopy_cn_vector.c View File

@@ -0,0 +1,106 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"


#if !defined(DOUBLE)
#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m4)
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
#define VLSEG2_FLOAT RISCV_RVV(vlseg2e32_v_f32m4x2)
#define VSSEG2_FLOAT RISCV_RVV(vsseg2e32_v_f32m4x2)
#define FLOAT_VX2_T vfloat32m4x2_t
#define VGET_VX2 RISCV_RVV(vget_v_f32m4x2_f32m4)
#define VSET_VX2 RISCV_RVV(vset_v_f32m4_f32m4x2)
#else
#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
#define VLSEG2_FLOAT RISCV_RVV(vlseg2e64_v_f64m4x2)
#define VSSEG2_FLOAT RISCV_RVV(vsseg2e64_v_f64m4x2)
#define FLOAT_VX2_T vfloat64m4x2_t
#define VGET_VX2 RISCV_RVV(vget_v_f64m4x2_f64m4)
#define VSET_VX2 RISCV_RVV(vset_v_f64m4_f64m4x2)
#endif

/*
 * Complex out-of-place matrix copy with scaling, column-major source and
 * destination ("cn" variant): b[:,i] = alpha * a[:,i] for each column.
 *
 * rows, cols      matrix dimensions, counted in complex elements
 * alpha_r/alpha_i complex scale factor alpha = alpha_r + i*alpha_i
 * a, lda          source matrix and leading dimension (complex elements)
 * b, ldb          destination matrix and leading dimension (complex elements)
 *
 * Always returns 0 (OpenBLAS kernel convention).
 */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
    BLASLONG i, j, ia;
    FLOAT *aptr, *bptr;
    FLOAT_V_T bptr_v0, bptr_v1, aptr_v0, aptr_v1;
    FLOAT_VX2_T va, vb;
    unsigned int gvl = 0;

    if ( rows <= 0 ) return(0);
    if ( cols <= 0 ) return(0);

    aptr = a;
    bptr = b;

    /* leading dimensions arrive in complex elements; convert to FLOAT units */
    lda *= 2;
    ldb *= 2;

    for ( i = 0; i < cols; i++ )
    {
        ia = 0;
        for ( j = 0; j < rows; j += gvl )
        {
            gvl = VSETVL(rows - j);
            /* segment load de-interleaves: field 0 = real parts, field 1 = imag parts */
            va = VLSEG2_FLOAT(aptr + ia, gvl);
            aptr_v0 = VGET_VX2(va, 0);
            aptr_v1 = VGET_VX2(va, 1);
            /* imag(b) = alpha_r*imag(a) + alpha_i*real(a) */
            bptr_v1 = VFMUL_VF_FLOAT(aptr_v1, alpha_r, gvl);
            bptr_v1 = VFMACCVF_FLOAT(bptr_v1, alpha_i, aptr_v0, gvl);
            /* real(b) = alpha_r*real(a) - alpha_i*imag(a) */
            bptr_v0 = VFMUL_VF_FLOAT(aptr_v0, alpha_r, gvl);
            bptr_v0 = VFNMSACVF_FLOAT(bptr_v0, alpha_i, aptr_v1, gvl);
            /* seed the tuple from va so VSET_VX2 never reads an
             * indeterminate value; both fields are overwritten below */
            vb = va;
            vb = VSET_VX2(vb, 0, bptr_v0);
            vb = VSET_VX2(vb, 1, bptr_v1);
            VSSEG2_FLOAT(&bptr[ia], vb, gvl);
            ia += gvl * 2;
        }
        aptr += lda;
        bptr += ldb;
    }

    return(0);

}

+ 40
- 49
kernel/riscv64/zscal.c View File

@@ -27,65 +27,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"

// The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces.
// In certain cases, the expected return values for cblas_s/zscal differ from those of other upper-level interfaces.
// To handle this, we use the dummy2 parameter to differentiate between them.
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;
BLASLONG i = 0;
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;

if ( (n <= 0) || (inc_x <= 0))
return(0);
if ((n <= 0) || (inc_x <= 0))
return(0);

inc_x2 = 2 * inc_x;
if (dummy2 == 0) {
for (i = 0; i < n; i++)
{
if (da_r == 0.0 && da_i == 0.0)
{
x[ip] = 0.0;
x[ip+1] = 0.0;
}
else
{
temp = da_r * x[ip] - da_i * x[ip+1];
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
x[ip] = temp;
}

inc_x2 = 2 * inc_x;
for ( i=0; i<n; i++ )
{
if ( da_r == 0.0 )
{
if ( da_i == 0.0 )
{
temp = 0.0;
x[ip+1] = 0.0 ;
}
else
{
temp = - da_i * x[ip+1] ;
if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
if (!isinf(x[ip+1]))
x[ip+1] = da_i * x[ip] ;
else x[ip+1] = NAN;
}
}
else
{
if ( da_i == 0.0 )
{
temp = da_r * x[ip] ;
x[ip+1] = da_r * x[ip+1];
}
else
{
temp = da_r * x[ip] - da_i * x[ip+1] ;
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
}
}
x[ip] = temp;
ip += inc_x2;
}
return(0);
}
for (i = 0; i < n; i++)
{
temp = da_r * x[ip] - da_i * x[ip+1];
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;

ip += inc_x2;
}

return(0);
x[ip] = temp;
ip += inc_x2;
}

return(0);
}



+ 13
- 1
kernel/riscv64/zscal_rvv.c View File

@@ -70,6 +70,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
FLOAT_VX2_T vx2;

if(inc_x == 1) {
if (dummy2 == 0 && da_r==0. && da_i == 0.) {
BLASLONG i;
for (i=0; i < n*2; i++) x[i]=0.;
return(0);
} else {

for (size_t vl; n > 0; n -= vl, x += vl*2) {
vl = VSETVL(n);
@@ -80,6 +85,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F

vt = VFMULVF_FLOAT(vr, da_r, vl);
vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);

vi = VFMULVF_FLOAT(vi, da_r, vl);
vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);

@@ -87,9 +93,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
vx2 = VSET_VX2(vx2, 1, vi);
VSSEG_FLOAT(x, vx2, vl);
}
}

} else {

if (dummy2 == 0 && da_r==0. && da_i == 0.) {
BLASLONG i,ix=0,inc_x2=2*inc_x;
for (i=0; i < n; i++) {x[ix]=0.;x[ix+1]=0.;ix+=inc_x2;};
return(0);
} else {
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
vl = VSETVL(n);

@@ -105,6 +116,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
vx2 = VSET_VX2(vx2, 0, vt);
vx2 = VSET_VX2(vx2, 1, vi);
VSSSEG_FLOAT(x, stride_x, vx2, vl);
}
}
}



+ 10
- 3
kernel/riscv64/zscal_vector.c View File

@@ -57,9 +57,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
if((n <= 0) || (inc_x <= 0))
return(0);

unsigned int gvl = 0;
FLOAT_V_T vt, v0, v1;
{
if (dummy2 == 0 && da_r == 0. && da_i == 0.) {
int i,inc_x2,ix;
inc_x2 = 2*inc_x;
ix=0;
for (i=0;i<n;i++){x[ix]=0.;x[ix+1]=0.;ix+=inc_x2;}
} else {
unsigned int gvl = 0;
FLOAT_V_T vt, v0, v1;
{
gvl = VSETVL(n);
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG inc_xv = inc_x * 2 * gvl;
@@ -91,6 +97,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
}
}
}
return(0);
}

+ 5
- 0
kernel/sparc/KERNEL View File

@@ -86,3 +86,8 @@ endif
ifndef QROTMKERNEL
QROTMKERNEL = ../generic/rotm.c
endif

SSCALKERNEL = ../arm/scal.c
DSCALKERNEL = ../arm/scal.c
CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c

+ 3
- 0
kernel/x86/KERNEL View File

@@ -200,3 +200,6 @@ endif
ifndef QROTMKERNEL
QROTMKERNEL = ../generic/rotm.c
endif

CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c

+ 2
- 2
kernel/x86_64/KERNEL View File

@@ -323,11 +323,11 @@ DSCALKERNEL = scal_sse2.S
endif

ifndef CSCALKERNEL
CSCALKERNEL = zscal_sse.S
CSCALKERNEL = ../arm/zscal.c
endif

ifndef ZSCALKERNEL
ZSCALKERNEL = zscal_sse2.S
ZSCALKERNEL = ../arm/zscal.c
endif

ifndef ASCALKERNEL


+ 52
- 22
kernel/x86_64/cscal.c View File

@@ -229,10 +229,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

if ( da_i == 0.0 )
{
if (!dummy2) {
while(j < n1)
{
x[i]=0.0;
x[i+1]=0.0;
x[i+inc_x]=0.0;
@@ -244,21 +243,48 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

while(j < n)
{
x[i]=0.0;
x[i+1]=0.0;
i += inc_x ;
j++;

}
} else {
float temp;
while(j < n1)
{
if (isnan(x[i])|| isnan(x[i+1]))
temp=NAN;
else
temp=0.0;
x[i]=temp;
x[i+1]=temp;
if (isnan(x[i+inc_x])|| isnan(x[i+inc_x+1]))
temp=NAN;
else
temp=0.0;
x[i+inc_x]= temp;
x[i+inc_x+1]= temp;
i += 2*inc_x;
j+=2;

}
while(j < n)
{
if (isnan(x[i])|| isnan(x[i+1]))
temp=NAN;
else
temp=0.0;
x[i]=temp;
x[i+1]=temp;
i += inc_x;
j++;
}
}
}
else
{

while(j < n1)
{
if (isnan(x[i]) || isinf(x[i]))
temp0 = NAN;
else
@@ -278,7 +304,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
x[i+inc_x] = temp1;
i += 2*inc_x ;
j+=2;

}

while(j < n)
@@ -305,14 +330,12 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
else
{


if ( da_i == 0.0 )
if ( da_i == 0.0 && dummy2 )
{
BLASLONG n1 = n & -2;

while(j < n1)
{
temp0 = da_r * x[i];
x[i+1] = da_r * x[i+1];
x[i] = temp0;
@@ -367,22 +390,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
return(0);
}


BLASLONG n1 = n & -16;
if ( n1 > 0 )
{

alpha[0] = da_r;
alpha[1] = da_i;
if ( da_r == 0.0 )
if ( da_i == 0 )
if ( da_i == 0 && !dummy2)
cscal_kernel_16_zero(n1 , alpha , x);
else
cscal_kernel_16_zero_r(n1 , alpha , x);
cscal_kernel_16/*_zero_r*/(n1 , alpha , x);
else
cscal_kernel_16(n1 , alpha , x);

i = n1 << 1;
j = n1;
}
@@ -393,6 +413,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
{
FLOAT res=0.0;
if (isnan(da_r)) res= da_r;
if (dummy2)
if (isnan(x[i])||isnan(x[i+1])) res= NAN;
while(j < n)
{
x[i]=res;
@@ -415,7 +437,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

} else
{

while(j < n)
{
temp0 = -da_i * x[i+1];
@@ -424,11 +445,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
if (!isinf(x[i+1]))
x[i+1] = da_i * x[i];
else x[i+1] = NAN;
if ( x[i] == x[i]) //preserve NaN
if ( !isnan(x[i])) //preserve NaN
x[i] = temp0;
i += 2 ;
j++;

}

}
@@ -439,12 +459,22 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

if ( da_i == 0.0 )
{

while(j < n)
{
temp0 = da_r * x[i];
x[i+1] = da_r * x[i+1];
if (dummy2) {
if (isnan(x[i])||isinf(x[i])) temp0=NAN;
if (isnan(x[i+1])||isinf(x[i+1]))
x[i+1]=NAN;
else
x[i+1] = da_r * x[i+1];
} else {
if (isnan(x[i]))
x[i+1] = NAN;
else
x[i+1] = da_r * x[i+1];
}
x[i] = temp0;
i += 2 ;
j++;
@@ -476,7 +506,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

temp0 = da_r * x[i] - da_i * x[i+1];
x[i+1] = da_r * x[i+1] + da_i * x[i];
x[i] = temp0;
if(!isnan(x[i]))x[i] = temp0;
i += 2 ;
j++;



+ 49
- 19
kernel/x86_64/zscal.c View File

@@ -222,13 +222,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

if ( da_r == 0.0 )
{

BLASLONG n1 = n & -2;

if ( da_i == 0.0 )
{
if (!dummy2) {
while(j < n1)
{

x[i]=0.0;
x[i+1]=0.0;
x[i+inc_x]=0.0;
@@ -245,9 +246,40 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
x[i+1]=0.0;
i += inc_x ;
j++;
}
} else {
float temp;
while(j < n1)
{
if (isnan(x[i])|| isnan(x[i+1]))
temp=NAN;
else
temp=0.0;
x[i]=temp;
x[i+1]=temp;
if (isnan(x[i+inc_x])|| isnan(x[i+inc_x+1]))
temp=NAN;
else
temp=0.0;
x[i+inc_x]= temp;
x[i+inc_x+1]= temp;
i += 2*inc_x;
j+=2;

}
while(j < n)
{
if (isnan(x[i])|| isnan(x[i+1]))
temp=NAN;
else
temp=0.0;
x[i]=temp;
x[i+1]=temp;
i += inc_x;
j++;

}
}
}
else
{
@@ -260,7 +292,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
temp0 = -da_i * x[i+1];
if (!isinf(x[i+1]))
x[i+1] = da_i * x[i];
else x[i+1] = NAN;
else x[i+1] = NAN;
x[i] = temp0;
if (isnan(x[i+inc_x]) || isinf(x[i+inc_x]))
temp1 = NAN;
@@ -291,16 +323,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

}



}

}
else
{


if ( da_i == 0.0 )
if ( da_i == 0.0 && dummy2)
{
BLASLONG n1 = n & -2;

@@ -370,26 +399,27 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
alpha[1] = da_i;

if ( da_r == 0.0 )
if ( da_i == 0 )
if ( da_i == 0 && !dummy2 )
zscal_kernel_8_zero(n1 , alpha , x);
else
// zscal_kernel_8_zero_r(n1 , alpha , x);
zscal_kernel_8(n1 , alpha , x);
else
if ( da_i == 0 && da_r == da_r)
/* if ( da_i == 0 && da_r == da_r )
zscal_kernel_8_zero_i(n1 , alpha , x);
else
else*/
zscal_kernel_8(n1 , alpha , x);
}
i = n1 << 1;
j = n1;
if ( da_r == 0.0 || da_r != da_r )
}
if ( da_r == 0.0 || isnan(da_r) )
{
if ( da_i == 0.0 )
{
FLOAT res=0.0;
if (da_r != da_r) res= da_r;
FLOAT res=0.0;
if (isnan(da_r)) res= da_r;
if (dummy2)
if (isnan(x[i])||isnan(x[i+1])) res= NAN;
while(j < n)
{
x[i]=res;
@@ -412,7 +442,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

} else
{

while(j < n)
{
temp0 = -da_i * x[i+1];
@@ -421,7 +450,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
if (!isinf(x[i+1]))
x[i+1] = da_i * x[i];
else x[i+1] = NAN;
if ( x[i] == x[i]) //preserve NaN
if ( !isnan(x[i])) //preserve NaN
x[i] = temp0;
i += 2 ;
j++;
@@ -437,8 +466,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
{
while(j < n)
{

temp0 = da_r * x[i];
if (isnan(x[i]))x[i+1]=NAN;
else
x[i+1] = da_r * x[i+1];
x[i] = temp0;
i += 2 ;
@@ -453,7 +483,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
{
temp0 = da_r * x[i] - da_i * x[i+1];
x[i+1] = da_r * x[i+1] + da_i * x[i];
x[i] = temp0;
if(!isnan(x[i]))x[i] = temp0;
i += 2 ;
j++;



+ 59
- 11
kernel/zarch/cscal.c View File

@@ -210,7 +210,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG n1 = n & -2;

if (da_i == 0.0) {
if (dummy2 == 0) {
while (j < n1) {

x[i] = 0.0;
@@ -230,11 +230,43 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
j++;

}
} else {
while (j < n1) {
if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) {
x[i] = NAN;
x[i+1] = NAN;
}else{
x[i] = 0.0;
x[i + 1] = 0.0;
}
if (isnan(x[i+inc_x]) || isinf(x[i+inc_x]) || isnan(x[i+1+inc_x])) {
x[i + inc_x] = NAN;
x[i + 1 + inc_x] = NAN;
} else {
x[i + inc_x] = 0.0;
x[i + 1 + inc_x] = 0.0;
}
i += 2 * inc_x;
j += 2;

}

while (j < n) {
if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) {
x[i] = NAN;
x[i+1] = NAN;
}else{
x[i] = 0.0;
x[i + 1] = 0.0;
}
i += inc_x;
j++;
}
}
} else {

while (j < n1) {
if (isnan(x[i]) || isinf(x[i]))
if (isnan(x[i]) || isinf(x[i]))
temp0 = NAN;
else
temp0 = -da_i * x[i + 1];
@@ -276,7 +308,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

} else {

if (da_i == 0.0) {
if (da_i == 0.0 && dummy2) {
BLASLONG n1 = n & -2;

while (j < n1) {
@@ -335,12 +367,16 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
alpha[1] = da_i;

if (da_r == 0.0)
if (da_i == 0)
if (da_i == 0 && dummy2 == 0)
cscal_kernel_16_zero(n1, x);
else
else {
/* if (dummy2 == 0)
cscal_kernel_16_zero_r(n1, alpha, x);
else if (da_i == 0)
cscal_kernel_16_zero_i(n1, alpha, x);
else*/
cscal_kernel_16(n1, da_r, da_i, x);
}
/* else if (da_i == 0 && !isnan(da_r))
cscal_kernel_16/*_zero_i(n1, alpha, x);*/
else
cscal_kernel_16(n1, da_r, da_i, x);

@@ -354,7 +390,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
float res = 0.0;
if (isnan(da_r)) res = da_r;
while (j < n) {

if (dummy2)
if (isnan(x[i])|| isnan(x[i+1])) res=NAN;
x[i] = res;
x[i + 1] = res;
i += 2;
@@ -382,7 +419,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
x[i + 1] = da_i * x[i];
else
x[i + 1] = NAN;
if (x[i] == x[i])
if (!isnan(x[i]))
x[i] = temp0;
i += 2;
j++;
@@ -398,7 +435,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
while (j < n) {

temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
if (dummy2) {
if (isnan(x[i])||isinf(x[i]))temp0 = NAN;
if (isnan(x[i+1])||isinf(x[i+1]))
x[i+1] = NAN;
else
x[i+1] = da_r * x[i + 1];
} else {
if (isnan(x[i]))
x[i + 1] = NAN;
else
x[i + 1] = da_r * x[i + 1];
}
x[i] = temp0;
i += 2;
j++;
@@ -411,7 +459,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
if (!isnan(x[i])) x[i] = temp0;
i += 2;
j++;



+ 52
- 11
kernel/zarch/zscal.c View File

@@ -208,7 +208,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG n1 = n & -2;

if (da_i == 0.0) {
if (dummy2 == 0) {
while (j < n1) {

x[i] = 0.0;
@@ -228,7 +228,38 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
j++;

}

} else {
while (j < n1) {
if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) {
x[i] = NAN;
x[i+1] = NAN;
} else {
x[i] = 0.0;
x[i+1] = 0.0;
}
if (isnan(x[i+inc_x]) || isinf(x[i+inc_x]) || isnan(x[i+inc_x+1])) {
x[i + inc_x] = NAN;
x[i + inc_x + 1] = NAN;
} else {
x[i + inc_x] = 0.;
x[i + inc_x + 1] = 0.;
}
i += 2 * inc_x;
j += 2;
}
while (j < n) {
if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) {
x[i] = NAN;
x[i+1] = NAN;
} else {
x[i] = 0.;
x[i+1] = 0.;
}
i += inc_x;
j++;
}
}
} else {

while (j < n1) {
@@ -276,7 +307,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

} else {

if (da_i == 0.0) {
if (da_i == 0.0 && dummy2) {
BLASLONG n1 = n & -2;

while (j < n1) {
@@ -335,12 +366,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
alpha[1] = da_i;

if (da_r == 0.0)
if (da_i == 0)
if (da_i == 0 && dummy2 == 0)
zscal_kernel_8_zero(n1, x);
else
zscal_kernel_8(n1, da_r, da_i, x);
else if (da_i == 0 && da_r == da_r)
zscal_kernel_8_zero_i(n1, alpha, x);
else
zscal_kernel_8(n1, da_r, da_i, x);

@@ -354,7 +383,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
double res= 0.0;
if (isnan(da_r)) res = da_r;
while (j < n) {

if (dummy2)
if (isnan(x[i]) || isnan(x[i+1])) res = NAN;
x[i] = res;
x[i + 1] = res;
i += 2;
@@ -381,7 +411,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
x[i + 1] = da_i * x[i];
else
x[i + 1] = NAN;
if (x[i]==x[i])
if (!isnan(x[i]))
x[i] = temp0;
i += 2;
j++;
@@ -397,8 +427,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
while (j < n) {

temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
if (dummy2) {
if (isnan(x[i]) || isinf(x[i])) temp0 = NAN;
if (isnan(x[i + 1]) || isinf(x[i + 1]))
x[i + 1] = NAN;
else
x[i + 1] = da_r * x[i + 1];
} else {
if (isnan(x[i]))
x[i + 1] = NAN;
else
x[i + 1] = da_r * x[i + 1];
}
x[i] = temp0;
i += 2;
j++;

@@ -410,7 +451,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
if (!isnan(x[i])) x[i] = temp0;
i += 2;
j++;



+ 474
- 0
utest/test_gemv.c View File

@@ -128,3 +128,477 @@ CTEST(dgemv, 0_nan_inf_incy_2)
}

#endif

#ifdef BUILD_COMPLEX

// cgemv with alpha == beta == 0: the BLAS contract is that Y is set to zero
// outright, even where it previously held NaN or Inf (no multiply by beta).
CTEST(cgemv, 0_nan_inf)
{
int i;
blasint N = 17;
blasint incX = 1;
blasint incY = 1;
float alpha[2] = {0.0, 0.0};
float beta[2] = {0.0, 0.0};
char trans = 'N';
float A[17 * 17 * 4];
float X[17 * 2];
float Y[17 * 2];

memset(A, 0, sizeof(A));
memset(X, 0, sizeof(X));
// Seed Y with alternating NaN and Inf complex pairs.
for (i = 0; i < (2 * N - 2); i += 4)
{
Y[i] = NAN;
Y[i + 1] = NAN;

Y[i + 2] = INFINITY;
Y[i + 3] = INFINITY;
}
// Last complex element (not covered by the stride-4 loop) gets NaN too.
Y[2 * N - 1] = NAN;
Y[2 * N - 2] = NAN;
BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
// Every entry of Y must have been overwritten with exactly 0.
for (i = 0; i < 2 * N; i ++)
ASSERT_TRUE(Y[i] == 0.0);
}

// cgemv with alpha == beta == 0 and incY == 2: only every other complex
// element of Y belongs to the vector; those must be zeroed even if they held
// NaN/Inf, while the gap elements (memset to 0) must stay untouched.
CTEST(cgemv, 0_nan_inf_incy_2)
{
    int i;
    blasint N = 17;
    blasint incX = 1;
    blasint incY = 2;
    float alpha[2] = {0.0, 0.0};
    float beta[2] = {0.0, 0.0};
    char trans = 'N';
    float A[17 * 17 * 4];
    /* N complex elements at incX == 1 need 2*N floats; the original
     * declaration X[17] was undersized (potential out-of-bounds read). */
    float X[17 * 2];
    float Y[17 * 2 * 2];
    float *ay = Y;

    memset(A, 0, sizeof(A));
    memset(X, 0, sizeof(X));
    memset(Y, 0, sizeof(Y));
    /* Seed the strided (in-vector) elements with alternating NaN and Inf. */
    for (i = 0; i < (2 * N - 2); i += 4)
    {
        ay[0] = NAN;
        ay[1] = NAN;
        ay += 4;
        ay[0] = INFINITY;
        ay[1] = INFINITY;
        ay += 4;
    }
    /* Last in-vector complex element, not covered by the loop above. */
    Y[4 * N - 4] = NAN;
    Y[4 * N - 3] = NAN;
    BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
    /* All of Y must now be zero: vector elements overwritten, gaps untouched. */
    for (i = 0; i < 4 * N; i ++)
        ASSERT_TRUE(Y[i] == 0.0);
}

// cgemv with alpha == 0 and purely imaginary beta == 2i: Y must be scaled by
// beta with componentwise complex multiplication, so NaN/Inf inputs propagate
// (e.g. real part becomes a*0 - b*2, where Inf*0 yields NaN).
CTEST(cgemv, 0_2_nan_1_inf_1)
{
int i;
blasint N = 17;
blasint incX = 1;
blasint incY = 1;
float alpha[2] = {0.0, 0.0};
float beta[2] = {0.0, 2.0};
char trans = 'N';
float A[17 * 17 * 4];
float X[17 * 2];
float Y[17 * 2];

memset(A, 0, sizeof(A));
memset(X, 0, sizeof(X));
// Even complex elements: NaN+1i; odd complex elements: Inf+1i.
for (i = 0; i < (2 * N - 2); i += 4)
{
Y[i] = NAN;
Y[i + 1] = 1.0;

Y[i + 2] = INFINITY;
Y[i + 3] = 1.0;
}
Y[2 * N - 2] = NAN;
Y[2 * N - 1] = 1.0;
BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
for (i = 0; i < 2 * N; i += 2) {
if ((i >> 1) % 2){
// Odd complex elements started as Inf+1i: (Inf+1i)*2i -> NaN + Inf*i.
ASSERT_TRUE(isnan(Y[i]));
ASSERT_TRUE(isinf(Y[i + 1]));
}
else {
// Even complex elements started as NaN+1i: NaN contaminates both parts.
ASSERT_TRUE(isnan(Y[i]));
ASSERT_TRUE(isnan(Y[i + 1]));
}
}
}

// cgemv with alpha == 0, beta == 2i, incY == 2: in-vector elements of Y are
// scaled by beta (propagating NaN/Inf through the complex multiply), while
// the stride-gap elements must remain exactly zero.
CTEST(cgemv, 0_2_nan_1_inf_1_incy_2)
{
    int i;
    blasint N = 17;
    blasint incX = 1;
    blasint incY = 2;
    float alpha[2] = {0.0, 0.0};
    float beta[2] = {0.0, 2.0};
    char trans = 'N';
    float A[17 * 17 * 4];
    /* N complex elements at incX == 1 need 2*N floats; the original
     * declaration X[17] was undersized (potential out-of-bounds read). */
    float X[17 * 2];
    float Y[17 * 2 * 2];
    float *ay = Y;

    memset(A, 0, sizeof(A));
    memset(X, 0, sizeof(X));
    memset(Y, 0, sizeof(Y));
    /* In-vector elements alternate NaN+1i and Inf+1i. */
    for (i = 0; i < (2 * N - 2); i += 4)
    {
        ay[0] = NAN;
        ay[1] = 1.0;
        ay += 4;
        ay[0] = INFINITY;
        ay[1] = 1.0;
        ay += 4;
    }
    Y[4 * N - 4] = NAN;
    Y[4 * N - 3] = 1.0;
    BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
    for (i = 0; i < 4 * N; i += 2) {
        if ((i >> 1) % 2) {
            /* Stride-gap complex element: must be untouched (still zero). */
            ASSERT_TRUE(Y[i] == 0.0);
            ASSERT_TRUE(Y[i + 1] == 0.0);
        }
        else {
            if ((i >> 2) % 2) {
                /* Started as Inf+1i: (Inf+1i)*2i -> NaN + Inf*i. */
                ASSERT_TRUE(isnan(Y[i]));
                ASSERT_TRUE(isinf(Y[i + 1]));
            }
            else {
                /* Started as NaN+1i: NaN contaminates both parts. */
                ASSERT_TRUE(isnan(Y[i]));
                ASSERT_TRUE(isnan(Y[i + 1]));
            }
        }
    }
}

// cgemv with alpha == 0 and purely real beta == 2: Y must be scaled by beta
// componentwise, so (Inf+1i)*2 -> Inf + NaN*i (the imag part picks up Inf*0)
// and NaN inputs contaminate both components.
CTEST(cgemv, 2_0_nan_1_inf_1)
{
int i;
blasint N = 17;
blasint incX = 1;
blasint incY = 1;
float alpha[2] = {0.0, 0.0};
float beta[2] = {2.0, 0.0};
char trans = 'N';
float A[17 * 17 * 4];
float X[17 * 2];
float Y[17 * 2];

memset(A, 0, sizeof(A));
memset(X, 0, sizeof(X));
// Even complex elements: NaN+1i; odd complex elements: Inf+1i.
for (i = 0; i < (2 * N - 2); i += 4)
{
Y[i] = NAN;
Y[i + 1] = 1.0;

Y[i + 2] = INFINITY;
Y[i + 3] = 1.0;
}
Y[2 * N - 2] = NAN;
Y[2 * N - 1] = 1.0;
BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
for (i = 0; i < 2 * N; i += 2) {
if ((i >> 1) % 2){
// Odd complex elements started as Inf+1i: real stays Inf, imag becomes NaN.
ASSERT_TRUE(isinf(Y[i]));
ASSERT_TRUE(isnan(Y[i + 1]));
}
else {
// Even complex elements started as NaN+1i: both parts become NaN.
ASSERT_TRUE(isnan(Y[i]));
ASSERT_TRUE(isnan(Y[i + 1]));
}
}
}

// cgemv with alpha == 0, real beta == 2, incY == 2: in-vector elements are
// scaled by beta ((Inf+1i)*2 -> Inf + NaN*i; NaN contaminates both parts),
// while the stride-gap elements must remain exactly zero.
CTEST(cgemv, 2_0_nan_1_inf_1_incy_2)
{
    int i;
    blasint N = 17;
    blasint incX = 1;
    blasint incY = 2;
    float alpha[2] = {0.0, 0.0};
    float beta[2] = {2.0, 0.0};
    char trans = 'N';
    float A[17 * 17 * 4];
    /* N complex elements at incX == 1 need 2*N floats; the original
     * declaration X[17] was undersized (potential out-of-bounds read). */
    float X[17 * 2];
    float Y[17 * 2 * 2];
    float *ay = Y;

    memset(A, 0, sizeof(A));
    memset(X, 0, sizeof(X));
    memset(Y, 0, sizeof(Y));
    /* In-vector elements alternate NaN+1i and Inf+1i. */
    for (i = 0; i < (2 * N - 2); i += 4)
    {
        ay[0] = NAN;
        ay[1] = 1.0;
        ay += 4;
        ay[0] = INFINITY;
        ay[1] = 1.0;
        ay += 4;
    }
    Y[4 * N - 4] = NAN;
    Y[4 * N - 3] = 1.0;
    BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
    for (i = 0; i < 4 * N; i += 2) {
        if ((i >> 1) % 2) {
            /* Stride-gap complex element: must be untouched (still zero). */
            ASSERT_TRUE(Y[i] == 0.0);
            ASSERT_TRUE(Y[i + 1] == 0.0);
        }
        else {
            if ((i >> 2) % 2) {
                /* Started as Inf+1i: real stays Inf, imag becomes NaN. */
                ASSERT_TRUE(isinf(Y[i]));
                ASSERT_TRUE(isnan(Y[i + 1]));
            }
            else {
                /* Started as NaN+1i: both parts become NaN. */
                ASSERT_TRUE(isnan(Y[i]));
                ASSERT_TRUE(isnan(Y[i + 1]));
            }
        }
    }
}

#endif

#ifdef BUILD_COMPLEX16

// zgemv with alpha == beta == 0: Y must be set to zero outright, even where
// it previously held NaN or Inf (double-precision twin of the cgemv test).
CTEST(zgemv, 0_nan_inf)
{
int i;
blasint N = 17;
blasint incX = 1;
blasint incY = 1;
double alpha[2] = {0.0, 0.0};
double beta[2] = {0.0, 0.0};
char trans = 'N';
double A[17 * 17 * 4];
double X[17 * 2];
double Y[17 * 2];

memset(A, 0, sizeof(A));
memset(X, 0, sizeof(X));
// Seed Y with alternating NaN and Inf complex pairs.
for (i = 0; i < (2 * N - 2); i += 4)
{
Y[i] = NAN;
Y[i + 1] = NAN;

Y[i + 2] = INFINITY;
Y[i + 3] = INFINITY;
}
// Last complex element (not covered by the stride-4 loop) gets NaN too.
Y[2 * N - 1] = NAN;
Y[2 * N - 2] = NAN;
BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
// Every entry of Y must have been overwritten with exactly 0.
for (i = 0; i < 2 * N; i ++)
ASSERT_TRUE(Y[i] == 0.0);
}

// zgemv with alpha == beta == 0 and incY == 2: in-vector elements of Y are
// zeroed even if they held NaN/Inf; stride-gap elements stay untouched.
CTEST(zgemv, 0_nan_inf_incy_2)
{
    int i;
    blasint N = 17;
    blasint incX = 1;
    blasint incY = 2;
    double alpha[2] = {0.0, 0.0};
    double beta[2] = {0.0, 0.0};
    char trans = 'N';
    double A[17 * 17 * 4];
    /* N complex elements at incX == 1 need 2*N doubles; the original
     * declaration X[17] was undersized (potential out-of-bounds read). */
    double X[17 * 2];
    double Y[17 * 2 * 2];
    double *ay = Y;

    memset(A, 0, sizeof(A));
    memset(X, 0, sizeof(X));
    memset(Y, 0, sizeof(Y));
    /* Seed the strided (in-vector) elements with alternating NaN and Inf. */
    for (i = 0; i < (2 * N - 2); i += 4)
    {
        ay[0] = NAN;
        ay[1] = NAN;
        ay += 4;
        ay[0] = INFINITY;
        ay[1] = INFINITY;
        ay += 4;
    }
    /* Last in-vector complex element, not covered by the loop above. */
    Y[4 * N - 4] = NAN;
    Y[4 * N - 3] = NAN;
    BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
    /* All of Y must now be zero: vector elements overwritten, gaps untouched. */
    for (i = 0; i < 4 * N; i ++)
        ASSERT_TRUE(Y[i] == 0.0);
}

// zgemv with alpha == 0 and purely imaginary beta == 2i: Y must be scaled by
// beta with componentwise complex multiplication, so NaN/Inf inputs propagate
// (e.g. real part becomes a*0 - b*2, where Inf*0 yields NaN).
CTEST(zgemv, 0_2_nan_1_inf_1)
{
int i;
blasint N = 17;
blasint incX = 1;
blasint incY = 1;
double alpha[2] = {0.0, 0.0};
double beta[2] = {0.0, 2.0};
char trans = 'N';
double A[17 * 17 * 4];
double X[17 * 2];
double Y[17 * 2];

memset(A, 0, sizeof(A));
memset(X, 0, sizeof(X));
// Even complex elements: NaN+1i; odd complex elements: Inf+1i.
for (i = 0; i < (2 * N - 2); i += 4)
{
Y[i] = NAN;
Y[i + 1] = 1.0;

Y[i + 2] = INFINITY;
Y[i + 3] = 1.0;
}
Y[2 * N - 2] = NAN;
Y[2 * N - 1] = 1.0;
BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
for (i = 0; i < 2 * N; i += 2) {
if ((i >> 1) % 2){
// Odd complex elements started as Inf+1i: (Inf+1i)*2i -> NaN + Inf*i.
ASSERT_TRUE(isnan(Y[i]));
ASSERT_TRUE(isinf(Y[i + 1]));
}
else {
// Even complex elements started as NaN+1i: NaN contaminates both parts.
ASSERT_TRUE(isnan(Y[i]));
ASSERT_TRUE(isnan(Y[i + 1]));
}
}
}

// zgemv with alpha == 0, beta == 2i, incY == 2: in-vector elements of Y are
// scaled by beta (propagating NaN/Inf through the complex multiply), while
// the stride-gap elements must remain exactly zero.
CTEST(zgemv, 0_2_nan_1_inf_1_incy_2)
{
    int i;
    blasint N = 17;
    blasint incX = 1;
    blasint incY = 2;
    double alpha[2] = {0.0, 0.0};
    double beta[2] = {0.0, 2.0};
    char trans = 'N';
    double A[17 * 17 * 4];
    /* N complex elements at incX == 1 need 2*N doubles; the original
     * declaration X[17] was undersized (potential out-of-bounds read). */
    double X[17 * 2];
    double Y[17 * 2 * 2];
    double *ay = Y;

    memset(A, 0, sizeof(A));
    memset(X, 0, sizeof(X));
    memset(Y, 0, sizeof(Y));
    /* In-vector elements alternate NaN+1i and Inf+1i. */
    for (i = 0; i < (2 * N - 2); i += 4)
    {
        ay[0] = NAN;
        ay[1] = 1.0;
        ay += 4;
        ay[0] = INFINITY;
        ay[1] = 1.0;
        ay += 4;
    }
    Y[4 * N - 4] = NAN;
    Y[4 * N - 3] = 1.0;
    BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
    for (i = 0; i < 4 * N; i += 2) {
        if ((i >> 1) % 2) {
            /* Stride-gap complex element: must be untouched (still zero). */
            ASSERT_TRUE(Y[i] == 0.0);
            ASSERT_TRUE(Y[i + 1] == 0.0);
        }
        else {
            if ((i >> 2) % 2) {
                /* Started as Inf+1i: (Inf+1i)*2i -> NaN + Inf*i. */
                ASSERT_TRUE(isnan(Y[i]));
                ASSERT_TRUE(isinf(Y[i + 1]));
            }
            else {
                /* Started as NaN+1i: NaN contaminates both parts. */
                ASSERT_TRUE(isnan(Y[i]));
                ASSERT_TRUE(isnan(Y[i + 1]));
            }
        }
    }
}

// zgemv with alpha == 0 and purely real beta == 2: Y must be scaled by beta
// componentwise, so (Inf+1i)*2 -> Inf + NaN*i (the imag part picks up Inf*0)
// and NaN inputs contaminate both components.
CTEST(zgemv, 2_0_nan_1_inf_1)
{
int i;
blasint N = 17;
blasint incX = 1;
blasint incY = 1;
double alpha[2] = {0.0, 0.0};
double beta[2] = {2.0, 0.0};
char trans = 'N';
double A[17 * 17 * 4];
double X[17 * 2];
double Y[17 * 2];

memset(A, 0, sizeof(A));
memset(X, 0, sizeof(X));
// Even complex elements: NaN+1i; odd complex elements: Inf+1i.
for (i = 0; i < (2 * N - 2); i += 4)
{
Y[i] = NAN;
Y[i + 1] = 1.0;

Y[i + 2] = INFINITY;
Y[i + 3] = 1.0;
}
Y[2 * N - 2] = NAN;
Y[2 * N - 1] = 1.0;
BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
for (i = 0; i < 2 * N; i += 2) {
if ((i >> 1) % 2){
// Odd complex elements started as Inf+1i: real stays Inf, imag becomes NaN.
ASSERT_TRUE(isinf(Y[i]));
ASSERT_TRUE(isnan(Y[i + 1]));
}
else {
// Even complex elements started as NaN+1i: both parts become NaN.
ASSERT_TRUE(isnan(Y[i]));
ASSERT_TRUE(isnan(Y[i + 1]));
}
}
}

// zgemv with alpha == 0, real beta == 2, incY == 2: in-vector elements are
// scaled by beta ((Inf+1i)*2 -> Inf + NaN*i; NaN contaminates both parts),
// while the stride-gap elements must remain exactly zero.
CTEST(zgemv, 2_0_nan_1_inf_1_incy_2)
{
    int i;
    blasint N = 17;
    blasint incX = 1;
    blasint incY = 2;
    double alpha[2] = {0.0, 0.0};
    double beta[2] = {2.0, 0.0};
    char trans = 'N';
    double A[17 * 17 * 4];
    /* N complex elements at incX == 1 need 2*N doubles; the original
     * declaration X[17] was undersized (potential out-of-bounds read). */
    double X[17 * 2];
    double Y[17 * 2 * 2];
    double *ay = Y;

    memset(A, 0, sizeof(A));
    memset(X, 0, sizeof(X));
    memset(Y, 0, sizeof(Y));
    /* In-vector elements alternate NaN+1i and Inf+1i. */
    for (i = 0; i < (2 * N - 2); i += 4)
    {
        ay[0] = NAN;
        ay[1] = 1.0;
        ay += 4;
        ay[0] = INFINITY;
        ay[1] = 1.0;
        ay += 4;
    }
    Y[4 * N - 4] = NAN;
    Y[4 * N - 3] = 1.0;
    BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
    for (i = 0; i < 4 * N; i += 2) {
        if ((i >> 1) % 2) {
            /* Stride-gap complex element: must be untouched (still zero). */
            ASSERT_TRUE(Y[i] == 0.0);
            ASSERT_TRUE(Y[i + 1] == 0.0);
        }
        else {
            if ((i >> 2) % 2) {
                /* Started as Inf+1i: real stays Inf, imag becomes NaN. */
                ASSERT_TRUE(isinf(Y[i]));
                ASSERT_TRUE(isnan(Y[i + 1]));
            }
            else {
                /* Started as NaN+1i: both parts become NaN. */
                ASSERT_TRUE(isnan(Y[i]));
                ASSERT_TRUE(isnan(Y[i + 1]));
            }
        }
    }
}

#endif

+ 54
- 0
utest/test_zscal.c View File

@@ -442,6 +442,33 @@ CTEST(cscal, i_0inf_inc_2)
ASSERT_TRUE(isnan(inf[17]));
}

// cscal by 0+0i through the cblas entry point: NaN already present in x must
// survive (0 * NaN == NaN), i.e. the "zero the vector" shortcut must not be
// taken when the input holds NaN.
CTEST(cscal, i00_NAN)
{
blasint N=9;
blasint incX=1;
float i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 };
float nan[] = {NAN, 0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
BLASFUNC(cscal)(&N, i, nan, &incX);
// First and last complex elements must have become NaN in both components.
ASSERT_TRUE(isnan(nan[0]));
ASSERT_TRUE(isnan(nan[1]));
ASSERT_TRUE(isnan(nan[16]));
ASSERT_TRUE(isnan(nan[17]));
}

// cscal by 0+0i with incX == 2: NaN in the imaginary parts of the strided
// elements must propagate to both components instead of being zeroed.
CTEST(cscal, i00_NAN_incx_2)
{
blasint N=9;
blasint incX=2;
float i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 };
float nan[] = {0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN,
0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN};
BLASFUNC(cscal)(&N, i, nan, &incX);
// nan[16]/nan[17] is the 5th in-vector complex element (stride 2 * 4 floats).
ASSERT_TRUE(isnan(nan[0]));
ASSERT_TRUE(isnan(nan[1]));
ASSERT_TRUE(isnan(nan[16]));
ASSERT_TRUE(isnan(nan[17]));
}

#endif

#ifdef BUILD_COMPLEX16
@@ -588,4 +615,31 @@ CTEST(zscal, i_0inf_inc_2)
ASSERT_TRUE(isnan(inf[17]));
}

// zscal by 0+0i through the cblas entry point: NaN already present in x must
// survive (0 * NaN == NaN) — double-precision twin of the cscal test.
CTEST(zscal, i00_NAN)
{
blasint N=9;
blasint incX=1;
double i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 };
double nan[] = {NAN, 0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
BLASFUNC(zscal)(&N, i, nan, &incX);
// First and last complex elements must have become NaN in both components.
ASSERT_TRUE(isnan(nan[0]));
ASSERT_TRUE(isnan(nan[1]));
ASSERT_TRUE(isnan(nan[16]));
ASSERT_TRUE(isnan(nan[17]));
}

// zscal by 0+0i with incX == 2: NaN in the imaginary parts of the strided
// elements must propagate to both components instead of being zeroed.
CTEST(zscal, i00_NAN_incx_2)
{
blasint N=9;
blasint incX=2;
double i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 };
double nan[] = {0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN,
0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN};
BLASFUNC(zscal)(&N, i, nan, &incX);
// nan[16]/nan[17] is the 5th in-vector complex element (stride 2 * 4 doubles).
ASSERT_TRUE(isnan(nan[0]));
ASSERT_TRUE(isnan(nan[1]));
ASSERT_TRUE(isnan(nan[16]));
ASSERT_TRUE(isnan(nan[17]));
}

#endif

Loading…
Cancel
Save