Browse Source

Merge branch 'OpenMathLib:develop' into gemmt_tests

pull/5187/head
Martin Kroeker GitHub 11 months ago
parent
commit
556ffac02d
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
41 changed files with 2158 additions and 992 deletions
  1. +33
    -3
      .github/workflows/c910v.yml
  2. +9
    -2
      CMakeLists.txt
  3. +1
    -0
      Makefile.system
  4. +5
    -5
      cmake/cc.cmake
  5. +1
    -1
      ctest/CMakeLists.txt
  6. +14
    -3
      driver/level3/level3_thread.c
  7. +9
    -6
      interface/gemm.c
  8. +2
    -2
      interface/zscal.c
  9. +2
    -2
      interface/zsyr.c
  10. +40
    -49
      kernel/arm/zscal.c
  11. +1
    -1
      kernel/arm64/dot_kernel_asimd.c
  12. +5
    -2
      kernel/arm64/zscal.S
  13. +332
    -0
      kernel/generic/zgemm_ncopy_16.c
  14. +212
    -562
      kernel/loongarch64/cgemm_ncopy_16_lasx.S
  15. +5
    -4
      kernel/loongarch64/cscal_lasx.S
  16. +58
    -160
      kernel/loongarch64/cscal_lsx.S
  17. +3
    -0
      kernel/loongarch64/zscal.S
  18. +47
    -50
      kernel/mips/zscal.c
  19. +3
    -0
      kernel/mips64/KERNEL
  20. +6
    -0
      kernel/power/zscal.S
  21. +1
    -1
      kernel/power/zscal.c
  22. +5
    -0
      kernel/power/zscal_ppc440.S
  23. +10
    -0
      kernel/riscv64/KERNEL.RISCV64_ZVL256B
  24. +149
    -0
      kernel/riscv64/axpby_vector_v2.c
  25. +123
    -0
      kernel/riscv64/omatcopy_cn_vector.c
  26. +47
    -0
      kernel/riscv64/zaxpy_vector.c
  27. +53
    -1
      kernel/riscv64/zdot_vector.c
  28. +63
    -20
      kernel/riscv64/zgemv_n_vector.c
  29. +106
    -0
      kernel/riscv64/zomatcopy_cn_vector.c
  30. +40
    -49
      kernel/riscv64/zscal.c
  31. +13
    -1
      kernel/riscv64/zscal_rvv.c
  32. +10
    -3
      kernel/riscv64/zscal_vector.c
  33. +5
    -0
      kernel/sparc/KERNEL
  34. +3
    -0
      kernel/x86/KERNEL
  35. +2
    -2
      kernel/x86_64/KERNEL
  36. +52
    -22
      kernel/x86_64/cscal.c
  37. +49
    -19
      kernel/x86_64/zscal.c
  38. +59
    -11
      kernel/zarch/cscal.c
  39. +52
    -11
      kernel/zarch/zscal.c
  40. +474
    -0
      utest/test_gemv.c
  41. +54
    -0
      utest/test_zscal.c

+ 33
- 3
.github/workflows/c910v.yml View File

@@ -83,9 +83,39 @@ jobs:

- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
qemu-riscv64 ./utest/openblas_utest
qemu-riscv64 ./utest/openblas_utest_ext
run_with_retry() {
local cmd="$1"
local time_out=10
local retries=10
local attempt=0

for ((i=1; i<=retries; i++)); do
attempt=$((i))
if timeout -s 12 --preserve-status $time_out $cmd; then
echo "Command succeeded on attempt $i."
return 0
else
local exit_code=$?
if [ $exit_code -eq 140 ]; then
echo "Attempt $i timed out (retrying...)"
time_out=$((time_out + 5))
else
echo "Attempt $i failed with exit code $exit_code. Aborting workflow."
exit $exit_code
fi
fi
done
echo "All $retries attempts failed, giving up."
echo "Final failure was due to timeout."
echo "Aborting workflow."
exit $exit_code
}
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
which qemu-riscv64
export QEMU_BIN=$(which qemu-riscv64)
run_with_retry "$QEMU_BIN ./utest/openblas_utest"
run_with_retry "$QEMU_BIN ./utest/openblas_utest_ext"

OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1


+ 9
- 2
CMakeLists.txt View File

@@ -62,6 +62,7 @@ option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm th

option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF)
option(BUILD_STATIC_LIBS "Build static library" OFF)
option(BUILD_SHARED_LIBS "Build shared library" OFF)
if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS)
set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE)
endif()
@@ -123,7 +124,12 @@ message(WARNING "CMake support is experimental. It does not yet support all buil
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")

set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
string(FIND "${LIBNAMESUFFIX}" "${SUFFIX64_UNDERSCORE}" HAVE64)
if (${HAVE64} GREATER -1)
set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX})
else ()
set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
endif ()

set(BLASDIRS interface driver/level2 driver/level3 driver/others)

@@ -716,4 +722,5 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
install(EXPORT "${PN}${SUFFIX64}Targets"
NAMESPACE "${PN}${SUFFIX64}::"
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
DESTINATION ${CMAKECONFIG_INSTALL_DIR})


+ 1
- 0
Makefile.system View File

@@ -276,6 +276,7 @@ SMALL_MATRIX_OPT = 1
endif
ifeq ($(ARCH), arm64)
GEMM_GEMV_FORWARD = 1
GEMM_GEMV_FORWARD_BF16 = 1
endif
ifeq ($(ARCH), riscv)
GEMM_GEMV_FORWARD = 1


+ 5
- 5
cmake/cc.cmake View File

@@ -229,9 +229,9 @@ if (${CORE} STREQUAL NEOVERSEN1)
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE)
set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-n1")
elseif (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a -mtune=neoverse-n1")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a")
endif()
endif ()
endif ()
@@ -260,13 +260,13 @@ endif ()

if (${CORE} STREQUAL CORTEXA510)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve")
endif ()
endif ()

if (${CORE} STREQUAL CORTEXA710)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve")
endif ()
endif ()

@@ -278,7 +278,7 @@ endif ()

if (${CORE} STREQUAL CORTEXX2)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve")
endif ()
endif ()



+ 1
- 1
ctest/CMakeLists.txt View File

@@ -6,7 +6,7 @@ enable_language(Fortran)
endif()

set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_EQUAL 14.2)
if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER 14.1)
list(REMOVE_ITEM ${CMAKE_Fortran_FLAGS} -O3 -O2 -O1 -Os)
set (CMAKE_Fortran_FLAGS_RELEASE "" CACHE STRING "" FORCE)
endif()


+ 14
- 3
driver/level3/level3_thread.c View File

@@ -851,9 +851,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
/* Objective function come from sum of partitions in m and n. */
/* (n / nthreads_n) + (m / nthreads_m) */
/* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m) */
while (nthreads_m % 2 == 0 && n * nthreads_m + m * nthreads_n > n * (nthreads_m / 2) + m * (nthreads_n * 2)) {
nthreads_m /= 2;
nthreads_n *= 2;
BLASLONG cost = 0, div = 0;
BLASLONG i;
for (i = 1; i <= sqrt(nthreads_m); i++) {
if (nthreads_m % i) continue;
BLASLONG j = nthreads_m / i;
BLASLONG cost_i = n * j + m * nthreads_n * i;
BLASLONG cost_j = n * i + m * nthreads_n * j;
if (cost == 0 ||
cost_i < cost) {cost = cost_i; div = i;}
if (cost_j < cost) {cost = cost_j; div = j;}
}
if (div > 1) {
nthreads_m /= div;
nthreads_n *= div;
}
}



+ 9
- 6
interface/gemm.c View File

@@ -417,21 +417,24 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS

PRINT_DEBUG_CNAME;

#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT)
#if defined(DYNAMIC_ARCH) && defined(ARCH_x86)
if (support_avx512() )
#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16)
#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_avx512() )
#endif
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
#endif
#if defined(DYNAMIC_ARCH) && defined(ARCH_ARM64)
if (support_sme1()){
#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
#if defined(DYNAMIC_ARCH)
if (support_sme1())
#endif
if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) {
SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
return;
}
}
#endif
#endif



+ 2
- 2
interface/zscal.c View File

@@ -98,7 +98,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){
if (nthreads == 1) {
#endif

SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 0);
SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 1);

#ifdef SMP
} else {
@@ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){
mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif

blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads);
blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 1, (int (*)(void))SCAL_K, nthreads);

}
#endif


+ 2
- 2
interface/zsyr.c View File

@@ -116,12 +116,12 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,

#else

void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) {
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, void* valpha, FLOAT *x, int incx, FLOAT *a, int lda) {

FLOAT *buffer;
int uplo;
blasint info;
FLOAT * ALPHA = &alpha;
FLOAT * ALPHA = (FLOAT*)valpha;
FLOAT alpha_r = ALPHA[0];
FLOAT alpha_i = ALPHA[1];
#ifdef SMP


+ 40
- 49
kernel/arm/zscal.c View File

@@ -27,65 +27,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"

// The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces.
// In certain cases, the expected return values for cblas_s/zscal differ from those of other upper-level interfaces.
// To handle this, we use the dummy2 parameter to differentiate between them.
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;
BLASLONG i = 0;
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;

if ( (n <= 0) || (inc_x <= 0))
return(0);
if ((n <= 0) || (inc_x <= 0))
return(0);

inc_x2 = 2 * inc_x;
if (dummy2 == 0) {
for (i = 0; i < n; i++)
{
if (da_r == 0.0 && da_i == 0.0)
{
x[ip] = 0.0;
x[ip+1] = 0.0;
}
else
{
temp = da_r * x[ip] - da_i * x[ip+1];
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
x[ip] = temp;
}

inc_x2 = 2 * inc_x;
for ( i=0; i<n; i++ )
{
if ( da_r == 0.0 )
{
if ( da_i == 0.0 )
{
temp = 0.0;
x[ip+1] = 0.0 ;
}
else
{
temp = - da_i * x[ip+1] ;
if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
if (!isinf(x[ip+1]))
x[ip+1] = da_i * x[ip] ;
else x[ip+1] = NAN;
}
}
else
{
if ( da_i == 0.0 )
{
temp = da_r * x[ip] ;
x[ip+1] = da_r * x[ip+1];
}
else
{
temp = da_r * x[ip] - da_i * x[ip+1] ;
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
}
}
x[ip] = temp;
ip += inc_x2;
}
return(0);
}
for (i = 0; i < n; i++)
{
temp = da_r * x[ip] - da_i * x[ip+1];
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;

ip += inc_x2;
}

return(0);
x[ip] = temp;
ip += inc_x2;
}

return(0);
}



+ 1
- 1
kernel/arm64/dot_kernel_asimd.c View File

@@ -134,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
" fadd v4.4s, v4.4s, v6.4s \n" \
" fadd v0.4s, v0.4s, v4.4s \n" \
" faddp v0.4s, v0.4s, v0.4s \n" \
" faddp v0.4s, v0.4s, v0.4s \n"
" faddp "OUT", v0.2s \n"

#else /* !defined(DSDOT) */
#define KERNEL_F1 \


+ 5
- 2
kernel/arm64/zscal.S View File

@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define INC_X x4 /* X stride */
#define I x5 /* loop variable */
#define X_COPY x6 /* Copy of X */
#define FLAG x7
/*******************************************************************************
* Macro definitions
*******************************************************************************/
@@ -216,6 +216,9 @@ zscal_begin:

cmp N, xzr
ble .Lzscal_kernel_L999
ldr FLAG, [sp]
cmp FLAG, #1
beq .Lzscal_kernel_RI_non_zero

fcmp DA_R, #0.0
bne .Lzscal_kernel_R_non_zero
@@ -228,7 +231,7 @@ zscal_begin:
.Lzscal_kernel_R_non_zero:

fcmp DA_I, #0.0
beq .Lzscal_kernel_I_zero
//QUAK beq .Lzscal_kernel_I_zero

/*******************************************************************************
* A_R != 0 && A_I != 0


+ 332
- 0
kernel/generic/zgemm_ncopy_16.c View File

@@ -0,0 +1,332 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
  /* Pack an m x n column-major complex matrix A (leading dimension lda,
     counted in complex elements) into the contiguous buffer B expected by
     the GEMM kernels.  Columns are consumed in panels of 16, then one
     optional panel each of 8, 4, 2 and 1 columns; within a panel the
     complex elements of a single row are stored back to back, row after
     row.  Returns 0 always, matching the other generic copy kernels. */

  BLASLONG i, j, w, width;
  IFLOAT *col[16];   /* per-column read cursors for the current panel */
  IFLOAT *src;       /* first column of the next unprocessed panel */
  IFLOAT *dst;       /* sequential write cursor into B */

  src = a;
  dst = b;
  lda *= 2;          /* complex data: two FLOATs per element */

  for (width = 16; width >= 1; width >>= 1) {
    /* How many panels of this width exist: all full 16-wide panels
       first, then at most one remainder panel per smaller width,
       selected by the corresponding bit of n. */
    BLASLONG panels = (width == 16) ? (n >> 4) : ((n & width) ? 1 : 0);

    for (j = 0; j < panels; j++) {
      for (w = 0; w < width; w++)
        col[w] = src + w * lda;
      src += width * lda;

      for (i = 0; i < m; i++) {
        for (w = 0; w < width; w++) {
          *dst++ = col[w][0];   /* real part      */
          *dst++ = col[w][1];   /* imaginary part */
          col[w] += 2;          /* advance this column to the next row */
        }
      }
    }
  }

  return 0;
}

+ 212
- 562
kernel/loongarch64/cgemm_ncopy_16_lasx.S View File

@@ -45,18 +45,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define S6 $r17
#define S7 $r18
#define S8 $r19
#define S9 $r20
#define S10 $r23
#define S11 $r24
#define S12 $r25
#define S13 $r26
#define S14 $r27
#define S15 $r28
#define S16 $r29
#define TD $r30
#define TS $r31
#define S9 $r23
#define S10 $r24
#define S11 $r25
#define S12 $r26
#define S13 $r27
#define S14 $r28
#define S15 $r29
#define S16 $r30
#define TD $r20
#define TS $r11
#define TL $r7
#define T0 $r6
#define ZERO $r0

#define F0 $f0
@@ -67,6 +66,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define F5 $f5
#define F6 $f6
#define F7 $f7
#define F8 $f8
#define F9 $f9
#define F10 $f10
#define F11 $f11
#define F12 $f12
#define F13 $f13
#define F14 $f14
#define F15 $f15
/* LASX vectors */
#define U0 $xr0
#define U1 $xr1
@@ -103,589 +110,232 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

PROLOGUE

addi.d $sp, $sp, -0x90
SDARG $r23, $sp, 0x00
SDARG $r24, $sp, 0x08
SDARG $r25, $sp, 0x10
SDARG $r26, $sp, 0x18
SDARG $r27, $sp, 0x20
SDARG $r28, $sp, 0x28
SDARG $r29, $sp, 0x30
SDARG $r30, $sp, 0x38
SDARG $r31, $sp, 0x40
ST $f23, $sp, 0x48
ST $f24, $sp, 0x50
ST $f25, $sp, 0x58
ST $f26, $sp, 0x60
ST $f27, $sp, 0x68
ST $f28, $sp, 0x70
ST $f29, $sp, 0x78
ST $f30, $sp, 0x80
ST $f31, $sp, 0x88

move TD, DST
move TS, SRC
slli.d TL, LDA, 0x03
slli.d T0, TL, 0x01
srai.d J, N, 0x04
addi.d $sp, $sp, -64
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 24
SDARG $r27, $sp, 32
SDARG $r28, $sp, 40
SDARG $r29, $sp, 48
SDARG $r30, $sp, 56

move TD, DST //boffset
move TS, SRC //aoffset
slli.d TL, LDA, 0x03 //lda
srai.d J, N, 0x04 //j
beq J, ZERO, .L_N8

.L_J1: /* J-- */
.L_J1: /* if(j>0) j--*/
move S1, TS
add.d S2, TS, TL
srai.d I, M, 0x03
move I, M
add.d S3, S2, TL
addi.d J, J, -1
add.d S4, S3, TL
add.d S5, S3, T0
add.d S6, S4, T0
add.d S7, S5, T0
add.d S8, S6, T0
add.d S9, S7, T0
add.d S10, S8, T0
add.d S11, S9, T0
add.d S12, S10, T0
add.d S13, S11, T0
add.d S14, S12, T0
add.d S15, S13, T0
add.d S16, S14, T0
add.d TS, S15, T0
beq I, ZERO, .L_I7

.L_I1: /* I-- */
xvld U0, S1, 0x00
xvld U1, S2, 0x00
xvld U2, S3, 0x00
xvld U3, S4, 0x00
xvld U4, S5, 0x00
xvld U5, S6, 0x00
xvld U6, S7, 0x00
xvld U7, S8, 0x00
xvld U8, S9, 0x00
xvld U9, S10, 0x00
xvld U10, S11, 0x00
xvld U11, S12, 0x00
xvld U12, S13, 0x00
xvld U13, S14, 0x00
xvld U14, S15, 0x00
xvld U15, S16, 0x00

xvpackev.d D0, U1, U0
xvpackod.d D1, U1, U0
xvpackev.d D2, U3, U2
xvpackod.d D3, U3, U2
xvpackev.d D4, U5, U4
xvpackod.d D5, U5, U4
xvpackev.d D6, U7, U6
xvpackod.d D7, U7, U6

xvpackev.d D8, U9, U8
xvpackod.d D9, U9, U8
xvpackev.d D10, U11, U10
xvpackod.d D11, U11, U10
xvpackev.d D12, U13, U12
xvpackod.d D13, U13, U12
xvpackev.d D14, U15, U14
xvpackod.d D15, U15, U14

xvand.v U0, D0, D0
xvpermi.q D0, D2, 0x02 // 0
xvand.v U4, D4, D4
xvpermi.q D4, D6, 0x02 // 1
xvand.v U1, D1, D1
xvpermi.q D1, D3, 0x02 // 4
xvand.v U5, D5, D5
xvpermi.q D5, D7, 0x02 // 5
xvpermi.q D2, U0, 0x31 // 8
xvpermi.q D6, U4, 0x31 // 9
xvpermi.q D3, U1, 0x31 // 12
xvpermi.q D7, U5, 0x31 // 13

xvand.v U8, D8, D8
xvpermi.q D8, D10, 0x02 // 2
xvand.v U12, D12, D12
xvpermi.q D12, D14, 0x02 // 3
xvand.v U9, D9, D9
xvpermi.q D9, D11, 0x02 // 6
xvand.v U13, D13, D13
xvpermi.q D13, D15, 0x02 // 7
xvpermi.q D10, U8, 0x31 // 10
xvpermi.q D14, U12, 0x31 // 11
xvpermi.q D11, U9, 0x31 // 14
xvpermi.q D15, U13, 0x31 // 15

xvst D0, TD, 0x00 // 0
xvst D4, TD, 0x20 // 1
xvst D8, TD, 0x40 // 2
xvst D12, TD, 0x60 // 3
xvst D1, TD, 0x80 // 4
xvst D5, TD, 0xA0 // 5
xvst D9, TD, 0xC0 // 6
xvst D13, TD, 0xE0 // 7
addi.d TD, TD, 0x100
xvst D2, TD, 0x00 // 8
xvst D6, TD, 0x20 // 9
xvst D10, TD, 0x40 // 10
xvst D14, TD, 0x60 // 11
xvst D3, TD, 0x80 // 12
xvst D7, TD, 0xA0 // 13
xvst D11, TD, 0xC0 // 14
xvst D15, TD, 0xE0 // 15
addi.d TD, TD, 0x100

xvld U0, S1, 0x20
xvld U1, S2, 0x20
xvld U2, S3, 0x20
xvld U3, S4, 0x20
xvld U4, S5, 0x20
xvld U5, S6, 0x20
xvld U6, S7, 0x20
xvld U7, S8, 0x20
xvld U8, S9, 0x20
xvld U9, S10, 0x20
xvld U10, S11, 0x20
xvld U11, S12, 0x20
xvld U12, S13, 0x20
xvld U13, S14, 0x20
xvld U14, S15, 0x20
xvld U15, S16, 0x20

xvpackev.d D0, U1, U0
xvpackod.d D1, U1, U0
xvpackev.d D2, U3, U2
xvpackod.d D3, U3, U2
xvpackev.d D4, U5, U4
xvpackod.d D5, U5, U4
xvpackev.d D6, U7, U6
xvpackod.d D7, U7, U6

xvpackev.d D8, U9, U8
xvpackod.d D9, U9, U8
xvpackev.d D10, U11, U10
xvpackod.d D11, U11, U10
xvpackev.d D12, U13, U12
xvpackod.d D13, U13, U12
xvpackev.d D14, U15, U14
xvpackod.d D15, U15, U14

xvand.v U0, D0, D0
xvpermi.q D0, D2, 0x02 // 0
xvand.v U4, D4, D4
xvpermi.q D4, D6, 0x02 // 1
xvand.v U1, D1, D1
xvpermi.q D1, D3, 0x02 // 4
xvand.v U5, D5, D5
xvpermi.q D5, D7, 0x02 // 5
xvpermi.q D2, U0, 0x31 // 8
xvpermi.q D6, U4, 0x31 // 9
xvpermi.q D3, U1, 0x31 // 12
xvpermi.q D7, U5, 0x31 // 13

xvand.v U8, D8, D8
xvpermi.q D8, D10, 0x02 // 2
xvand.v U12, D12, D12
xvpermi.q D12, D14, 0x02 // 3
xvand.v U9, D9, D9
xvpermi.q D9, D11, 0x02 // 6
xvand.v U13, D13, D13
xvpermi.q D13, D15, 0x02 // 7
xvpermi.q D10, U8, 0x31 // 10
xvpermi.q D14, U12, 0x31 // 11
xvpermi.q D11, U9, 0x31 // 14
xvpermi.q D15, U13, 0x31 // 15

xvst D0, TD, 0x00 // 0
xvst D4, TD, 0x20 // 1
xvst D8, TD, 0x40 // 2
xvst D12, TD, 0x60 // 3
xvst D1, TD, 0x80 // 4
xvst D5, TD, 0xA0 // 5
xvst D9, TD, 0xC0 // 6
xvst D13, TD, 0xE0 // 7
addi.d TD, TD, 0x100
xvst D2, TD, 0x00 // 8
xvst D6, TD, 0x20 // 9
xvst D10, TD, 0x40 // 10
xvst D14, TD, 0x60 // 11
xvst D3, TD, 0x80 // 12
xvst D7, TD, 0xA0 // 13
xvst D11, TD, 0xC0 // 14
xvst D15, TD, 0xE0 // 15
addi.d TD, TD, 0x100


addi.d S1, S1, 0x40
addi.d S2, S2, 0x40
addi.d S3, S3, 0x40
addi.d S4, S4, 0x40
addi.d S5, S5, 0x40
addi.d S6, S6, 0x40
addi.d S7, S7, 0x40
addi.d S8, S8, 0x40
addi.d S9, S9, 0x40
addi.d S10, S10, 0x40
addi.d S11, S11, 0x40
addi.d S12, S12, 0x40
addi.d S13, S13, 0x40
addi.d S14, S14, 0x40
addi.d S15, S15, 0x40
addi.d S16, S16, 0x40

add.d S5, S4, TL
add.d S6, S5, TL
add.d S7, S6, TL
add.d S8, S7, TL
add.d S9, S8, TL
add.d S10, S9, TL
add.d S11, S10, TL
add.d S12, S11, TL
add.d S13, S12, TL
add.d S14, S13, TL
add.d S15, S14, TL
add.d S16, S15, TL
add.d TS, S16, TL
beq I, ZERO, .L_J11

.L_I1: /* if(i>0) i--*/
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
fld.d F2, S3, 0x00
fld.d F3, S4, 0x00
fld.d F4, S5, 0x00
fld.d F5, S6, 0x00
fld.d F6, S7, 0x00
fld.d F7, S8, 0x00

fst.d F0, TD, 0x00
fst.d F1, TD, 0x08
fst.d F2, TD, 0x10
fst.d F3, TD, 0x18
fst.d F4, TD, 0x20
fst.d F5, TD, 0x28
fst.d F6, TD, 0x30
fst.d F7, TD, 0x38

fld.d F0, S9, 0x00
fld.d F1, S10, 0x00
fld.d F2, S11, 0x00
fld.d F3, S12, 0x00
fld.d F4, S13, 0x00
fld.d F5, S14, 0x00
fld.d F6, S15, 0x00
fld.d F7, S16, 0x00

fst.d F0, TD, 0x40
fst.d F1, TD, 0x48
fst.d F2, TD, 0x50
fst.d F3, TD, 0x58
fst.d F4, TD, 0x60
fst.d F5, TD, 0x68
fst.d F6, TD, 0x70
fst.d F7, TD, 0x78

addi.d S1, S1, 0x08
addi.d S2, S2, 0x08
addi.d S3, S3, 0x08
addi.d S4, S4, 0x08
addi.d S5, S5, 0x08
addi.d S6, S6, 0x08
addi.d S7, S7, 0x08
addi.d S8, S8, 0x08
addi.d S9, S9, 0x08
addi.d S10, S10, 0x08
addi.d S11, S11, 0x08
addi.d S12, S12, 0x08
addi.d S13, S13, 0x08
addi.d S14, S14, 0x08
addi.d S15, S15, 0x08
addi.d S16, S16, 0x08
addi.d TD, TD, 0x80
addi.d I, I, -1
blt ZERO, I, .L_I1

.L_I7:
andi I, M, 0x07
beq I, ZERO, .L_I0

.L_II1: /* I-- */
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
fld.d F2, S3, 0x00
fld.d F3, S4, 0x00
fld.d F4, S5, 0x00
fld.d F5, S6, 0x00
fld.d F6, S7, 0x00
fld.d F7, S8, 0x00

fst.d F0, TD, 0x00
addi.d S1, S1, 0x08
fst.d F1, TD, 0x08
addi.d S2, S2, 0x08
fst.d F2, TD, 0x10
addi.d S3, S3, 0x08
fst.d F3, TD, 0x18
addi.d S4, S4, 0x08
fst.d F4, TD, 0x20
addi.d S5, S5, 0x08
fst.d F5, TD, 0x28
addi.d S6, S6, 0x08
fst.d F6, TD, 0x30
addi.d S7, S7, 0x08
fst.d F7, TD, 0x38
addi.d S8, S8, 0x08
addi.d TD, TD, 0x40

fld.d F0, S9, 0x00
fld.d F1, S10, 0x00
fld.d F2, S11, 0x00
fld.d F3, S12, 0x00
fld.d F4, S13, 0x00
fld.d F5, S14, 0x00
fld.d F6, S15, 0x00
fld.d F7, S16, 0x00

fst.d F0, TD, 0x00
addi.d S9, S9, 0x08
fst.d F1, TD, 0x08
addi.d S10, S10, 0x08
fst.d F2, TD, 0x10
addi.d S11, S11, 0x08
fst.d F3, TD, 0x18
addi.d S12, S12, 0x08
fst.d F4, TD, 0x20
addi.d S13, S13, 0x08
fst.d F5, TD, 0x28
addi.d S14, S14, 0x08
fst.d F6, TD, 0x30
addi.d S15, S15, 0x08
fst.d F7, TD, 0x38
addi.d S16, S16, 0x08
addi.d TD, TD, 0x40

addi.d I, I, -1
blt ZERO, I, .L_II1

.L_I0:
blt ZERO, J, .L_J1

.L_N8:
andi J, N, 0x08
beq ZERO, J, .L_N4
.L_J11: /* j--*/
addi.d J, J, -1
blt ZERO, J, .L_J1

.L_N8: /* if(n&8)*/
andi I, N, 0x08
beq I, ZERO, .L_N4

move S1, TS
add.d S2, TS, TL
srai.d I, M, 0x03
move I, M
add.d S3, S2, TL
add.d S4, S2, T0
add.d S5, S3, T0
add.d S6, S4, T0
add.d S7, S5, T0
add.d S8, S6, T0
add.d TS, S7, T0
beq I, ZERO, .L_8I3

.L_8I1: /* I-- */
xvld U0, S1, 0x00
xvld U1, S2, 0x00
xvld U2, S3, 0x00
xvld U3, S4, 0x00
xvld U4, S5, 0x00
xvld U5, S6, 0x00
xvld U6, S7, 0x00
xvld U7, S8, 0x00

xvpackev.d D0, U1, U0
xvpackod.d D1, U1, U0
xvpackev.d D2, U3, U2
xvpackod.d D3, U3, U2
xvpackev.d D4, U5, U4
xvpackod.d D5, U5, U4
xvpackev.d D6, U7, U6
xvpackod.d D7, U7, U6

xvand.v U0, D0, D0
xvpermi.q D0, D2, 0x02 // 0
xvand.v U4, D4, D4
xvpermi.q D4, D6, 0x02 // 1
xvand.v U1, D1, D1
xvpermi.q D1, D3, 0x02 // 2
xvand.v U5, D5, D5
xvpermi.q D5, D7, 0x02 // 3
xvpermi.q D2, U0, 0x31 // 4
xvpermi.q D6, U4, 0x31 // 5
xvpermi.q D3, U1, 0x31 // 6
xvpermi.q D7, U5, 0x31 // 7

xvst D0, TD, 0x00
xvst D4, TD, 0x20
xvst D1, TD, 0x40
xvst D5, TD, 0x60
xvst D2, TD, 0x80
xvst D6, TD, 0xA0
xvst D3, TD, 0xC0
xvst D7, TD, 0xE0
addi.d TD, TD, 0x100

xvld U0, S1, 0x20
xvld U1, S2, 0x20
xvld U2, S3, 0x20
xvld U3, S4, 0x20
xvld U4, S5, 0x20
xvld U5, S6, 0x20
xvld U6, S7, 0x20
xvld U7, S8, 0x20

xvpackev.d D0, U1, U0
xvpackod.d D1, U1, U0
xvpackev.d D2, U3, U2
xvpackod.d D3, U3, U2
xvpackev.d D4, U5, U4
xvpackod.d D5, U5, U4
xvpackev.d D6, U7, U6
xvpackod.d D7, U7, U6

xvand.v U0, D0, D0
xvpermi.q D0, D2, 0x02 // 0
xvand.v U4, D4, D4
xvpermi.q D4, D6, 0x02 // 1
xvand.v U1, D1, D1
xvpermi.q D1, D3, 0x02 // 2
xvand.v U5, D5, D5
xvpermi.q D5, D7, 0x02 // 3
xvpermi.q D2, U0, 0x31 // 4
xvpermi.q D6, U4, 0x31 // 5
xvpermi.q D3, U1, 0x31 // 6
xvpermi.q D7, U5, 0x31 // 7

xvst D0, TD, 0x00
xvst D4, TD, 0x20
xvst D1, TD, 0x40
xvst D5, TD, 0x60
xvst D2, TD, 0x80
xvst D6, TD, 0xA0
xvst D3, TD, 0xC0
xvst D7, TD, 0xE0
addi.d TD, TD, 0x100

addi.d S1, S1, 0x40
addi.d S2, S2, 0x40
addi.d S3, S3, 0x40
addi.d S4, S4, 0x40
addi.d S5, S5, 0x40
addi.d S6, S6, 0x40
addi.d S7, S7, 0x40
addi.d S8, S8, 0x40

add.d S4, S3, TL
add.d S5, S4, TL
add.d S6, S5, TL
add.d S7, S6, TL
add.d S8, S7, TL
add.d TS, S8, TL
beq I, ZERO, .L_N4

.L_N81: /* if(i>0) i--*/
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
fld.d F2, S3, 0x00
fld.d F3, S4, 0x00
fld.d F4, S5, 0x00
fld.d F5, S6, 0x00
fld.d F6, S7, 0x00
fld.d F7, S8, 0x00

fst.d F0, TD, 0x00
fst.d F1, TD, 0x08
fst.d F2, TD, 0x10
fst.d F3, TD, 0x18
fst.d F4, TD, 0x20
fst.d F5, TD, 0x28
fst.d F6, TD, 0x30
fst.d F7, TD, 0x38

addi.d S1, S1, 0x08
addi.d S2, S2, 0x08
addi.d S3, S3, 0x08
addi.d S4, S4, 0x08
addi.d S5, S5, 0x08
addi.d S6, S6, 0x08
addi.d S7, S7, 0x08
addi.d S8, S8, 0x08
addi.d TD, TD, 0x40
addi.d I, I, -1
blt ZERO, I, .L_8I1

.L_8I3:
andi I, M, 0x07
beq I, ZERO, .L_N4

.L_8I11:
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
fld.d F2, S3, 0x00
fld.d F3, S4, 0x00
fld.d F4, S5, 0x00
fld.d F5, S6, 0x00
fld.d F6, S7, 0x00
fld.d F7, S8, 0x00

fst.d F0, TD, 0x00
addi.d S1, S1, 0x08
fst.d F1, TD, 0x08
addi.d S2, S2, 0x08
fst.d F2, TD, 0x10
addi.d S3, S3, 0x08
fst.d F3, TD, 0x18
addi.d S4, S4, 0x08
fst.d F4, TD, 0x20
addi.d S5, S5, 0x08
fst.d F5, TD, 0x28
addi.d S6, S6, 0x08
fst.d F6, TD, 0x30
addi.d S7, S7, 0x08
fst.d F7, TD, 0x38
addi.d S8, S8, 0x08

addi.d TD, TD, 0x40
addi.d I, I, -1
blt ZERO, I, .L_8I11

.L_N4:
andi J, N, 0x04
beq ZERO, J, .L_N2
blt ZERO, I, .L_N81

.L_N4: /* if(n&4)*/
andi I, N, 0x04
beq I, ZERO, .L_N2

move S1, TS
add.d S2, TS, TL
srai.d I, M, 0x02
move I, M
add.d S3, S2, TL
add.d S4, S2, T0
add.d TS, S3, T0
beq I, ZERO, .L_I3

.L_4I1: /* I-- */
xvld U0, S1, 0x00
xvld U1, S2, 0x00
xvld U2, S3, 0x00
xvld U3, S4, 0x00

xvpackev.d D0, U1, U0
xvpackod.d D1, U1, U0
xvpackev.d D2, U3, U2
xvpackod.d D3, U3, U2

xvand.v U0, D0, D0
xvpermi.q D0, D2, 0x02 // 0
xvand.v U1, D1, D1
xvpermi.q D1, D3, 0x02 // 1
xvpermi.q D2, U0, 0x31 // 2
xvpermi.q D3, U1, 0x31 // 3

xvst D0, TD, 0x00
xvst D1, TD, 0x20
xvst D2, TD, 0x40
xvst D3, TD, 0x60

addi.d S1, S1, 0x20
addi.d S2, S2, 0x20
addi.d S3, S3, 0x20
addi.d S4, S4, 0x20
addi.d TD, TD, 0x80

add.d S4, S3, TL
add.d TS, S4, TL
beq I, ZERO, .L_N2

.L_N41: /* if(i>0)*/
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
fld.d F2, S3, 0x00
fld.d F3, S4, 0x00

fst.d F0, TD, 0x00
fst.d F1, TD, 0x08
fst.d F2, TD, 0x10
fst.d F3, TD, 0x18

addi.d S1, S1, 0x08
addi.d S2, S2, 0x08
addi.d S3, S3, 0x08
addi.d S4, S4, 0x08
addi.d TD, TD, 0x20
addi.d I, I, -1
blt ZERO, I, .L_4I1

.L_I3:
andi I, M, 0x03
beq I, ZERO, .L_N2

.L_4II1:
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
fld.d F2, S3, 0x00
fld.d F3, S4, 0x00

fst.d F0, TD, 0x00
addi.d S1, S1, 0x08
fst.d F1, TD, 0x08
addi.d S2, S2, 0x08
fst.d F2, TD, 0x10
addi.d S3, S3, 0x08
fst.d F3, TD, 0x18
addi.d S4, S4, 0x08

addi.d TD, TD, 0x20
addi.d I, I, -1
blt ZERO, I, .L_4II1

.L_N2:
andi J, N, 0x02
beq ZERO, J, .L_N1
blt ZERO, I, .L_N41

.L_N2: /* if(n&2)*/
andi I, N, 0x02
beq I, ZERO, .L_N1

move S1, TS
add.d S2, TS, TL
srai.d I, M, 0x01
move I, M
add.d TS, S2, TL
beq I, ZERO, .L_NI1

.L_2I1: /* I-- */
xvld U0, S1, 0x00
xvld U1, S2, 0x00

xvpackev.d D0, U1, U0
xvpackod.d D1, U1, U0

xvpermi.q D0, D1, 0x02 // 0
beq I, ZERO, .L_N1

xvst D0, TD, 0x00
.L_N21: /* if(i>0)*/
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00

addi.d S1, S1, 0x10
addi.d S2, S2, 0x10
addi.d TD, TD, 0x20
fst.d F0, TD, 0x00
fst.d F1, TD, 0x08

addi.d S1, S1, 0x08
addi.d S2, S2, 0x08
addi.d TD, TD, 0x10
addi.d I, I, -1
blt ZERO, I, .L_2I1

.L_NI1:
andi I, M, 0x01
beq I, ZERO, .L_N1

blt ZERO, I, .L_N21

fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
.L_N1: /* if(n&2)*/
andi I, N, 0x01
beq I, ZERO, .L_N0

fst.d F0, TD, 0x00
addi.d S1, S1, 0x08
fst.d F1, TD, 0x08
addi.d S2, S2, 0x08
addi.d TD, TD, 0x10
move S1, TS
move I, M
beq I, ZERO, .L_N0

.L_N1:
move S1, TS
beq ZERO, M, .L_N0
.L_N11: /* if(i>0)*/
fld.d F0, S1, 0x00
fst.d F0, TD, 0x00

.L_M1:
fld.d F0, S1, 0x00
addi.d S1, S1, 0x08
fst.d F0, TD, 0x00
addi.d TD, TD, 0x08
addi.d M, M, -1
blt ZERO, M, .L_M1
addi.d S1, S1, 0x08
addi.d TD, TD, 0x08
addi.d I, I, -1
blt ZERO, I, .L_N11

.L_N0:
LDARG $r23, $sp, 0x00
LDARG $r24, $sp, 0x08
LDARG $r25, $sp, 0x10
LDARG $r26, $sp, 0x18
LDARG $r27, $sp, 0x20
LDARG $r28, $sp, 0x28
LDARG $r29, $sp, 0x30
LDARG $r30, $sp, 0x38
LDARG $r31, $sp, 0x40
LD $f23, $sp, 0x48
LD $f24, $sp, 0x50
LD $f25, $sp, 0x58
LD $f26, $sp, 0x60
LD $f27, $sp, 0x68
LD $f28, $sp, 0x70
LD $f29, $sp, 0x78
LD $f30, $sp, 0x80
LD $f31, $sp, 0x88
addi.d $sp, $sp, 0x90
jirl $r0, $r1, 0x00
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
LDARG $r25, $sp, 16
LDARG $r26, $sp, 24
LDARG $r27, $sp, 32
LDARG $r28, $sp, 40
LDARG $r29, $sp, 48
LDARG $r30, $sp, 56
addi.d $sp, $sp, 64
jirl $r0, $r1, 0x00

EPILOGUE

+ 5
- 4
kernel/loongarch64/cscal_lasx.S View File

@@ -94,7 +94,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
CMPEQ $fcc1, ALPHAI, a1
bge $r0, I, .L19
/////// INCX == 1 && N >= 4 ////////
bnez DUMMY2, .L17 // if DUMMPY2 == 1, called from c/zscal.
bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal.

bceqz $fcc0, .L17

@@ -146,6 +146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d I, I, -1
blt $r0, I, .L17
b .L19

.align 3

/////// INCX == 1 && N < 8 ///////
@@ -156,7 +157,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
andi I, N, 7
#endif
beqz I, .L999
bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal.
bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.

bceqz $fcc0, .L998

@@ -171,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
CMPEQ $fcc1, ALPHAI, a1
move XX, X
bge $r0, I, .L29
bnez DUMMY2, .L25 // if DUMMPY2 == 1, called from c/zscal.
bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal.
bceqz $fcc0, .L25

bceqz $fcc1, .L25
@@ -341,7 +342,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
andi I, N, 7
#endif
beqz I, .L999
bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal.
bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.

bceqz $fcc0, .L998



+ 58
- 160
kernel/loongarch64/cscal_lsx.S View File

@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHAI $f1
#define X $r7
#define INCX $r8
#define DUMMY2 $r9

#define I $r12
#define TEMP $r13
@@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

bge $r0, N, .L999
bge $r0, INCX, .L999
ld.d DUMMY2, $sp, 0
li.d TEMP, 1
movgr2fr.d a1, $r0
FFINT a1, a1
@@ -84,24 +86,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
srai.d I, N, 2
bne INCX, TEMP, .L22

/////// INCX == 1 ////////
.L11:
bge $r0, I, .L997
CMPEQ $fcc0, ALPHAR, a1
CMPEQ $fcc1, ALPHAI, a1
bceqz $fcc0, .L13
b .L14
.align 3
bge $r0, I, .L19

.L13:
bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
b .L113 //alpha_r != 0.0 && alpha_i == 0.0
/////// INCX == 1 && N >= 4 ////////
bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal.

.L14:
bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0
b .L111 //alpha_r == 0.0 && alpha_i == 0.0
.align 3
bceqz $fcc0, .L17

.L111: //alpha_r == 0.0 && alpha_i == 0.0
bceqz $fcc1, .L17

.L15: //alpha_r == 0.0 && alpha_i == 0.0
vst VXZ, X, 0 * SIZE
#ifdef DOUBLE
vst VXZ, X, 2 * SIZE
@@ -112,50 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L111
b .L997
.align 3

.L113: //alpha_r != 0.0 && alpha_i == 0.0
vld VX0, X, 0 * SIZE
#ifdef DOUBLE
vld VX1, X, 2 * SIZE
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
vfmul.d x3, VXAR, x1
vfmul.d x4, VXAR, x2
vilvl.d VX2, x4 ,x3
vilvh.d VX3, x4, x3
vst VX2, X, 0 * SIZE
vst VX3, X, 2 * SIZE
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
vfmul.d x3, VXAR, x1
vfmul.d x4, VXAR, x2
vilvl.d VX2, x4 ,x3
vilvh.d VX3, x4, x3
vst VX2, X, 4 * SIZE
vst VX3, X, 6 * SIZE
#else
vld VX1, X, 4 * SIZE
vpickev.w x1, VX1, VX0
vpickod.w x2, VX1, VX0
vfmul.s x3, VXAR, x1
vfmul.s x4, VXAR, x2
vilvl.w VX2, x4 ,x3
vilvh.w VX3, x4, x3
vst VX2, X, 0 * SIZE
vst VX3, X, 4 * SIZE
#endif
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L113
b .L997
blt $r0, I, .L15
b .L19
.align 3

.L114: //alpha_r != 0.0 && alpha_i != 0.0
.L17:
vld VX0, X, 0 * SIZE
#ifdef DOUBLE
vld VX1, X, 2 * SIZE
@@ -196,29 +155,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
addi.d X, X, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L114
b .L997
blt $r0, I, .L17
b .L19
.align 3

/////// INCX == 1 && N < 8 ///////
.L19:
andi I, N, 3
beqz I, .L999
bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.

bceqz $fcc0, .L998

bceqz $fcc1, .L998

b .L995 // alpha_r == 0.0 && alpha_i == 0.0

/////// INCX != 1 ////////
.L22:
bge $r0, I, .L997
move XX, X
CMPEQ $fcc0, ALPHAR, a1
CMPEQ $fcc1, ALPHAI, a1
bceqz $fcc0, .L23
b .L24
.align 3
move XX, X
bge $r0, I, .L29
bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal.

.L23:
bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
b .L223 //alpha_r != 0.0 && alpha_i == 0.0
bceqz $fcc0, .L25

.L24:
bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0
b .L221 //alpha_r == 0.0 && alpha_i == 0.0
.align 3
bceqz $fcc1, .L25

.L221: //alpha_r == 0.0 && alpha_i == 0.0
.L27: //alpha_r == 0.0 && alpha_i == 0.0
#ifdef DOUBLE
vstelm.d VXZ, X, 0, 0
vstelm.d VXZ, X, 1 * SIZE, 0
@@ -246,92 +211,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
add.d X, X, INCX
addi.d I, I, -1
blt $r0, I, .L221
b .L997
blt $r0, I, .L27
b .L29
.align 3

.L223: //alpha_r != 0.0 && alpha_i == 0.0
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
vfmul.d x3, VXAR, x1
vfmul.d x4, VXAR, x2
vstelm.d x3, XX, 0 * SIZE, 0
vstelm.d x4, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.d x3, XX, 0 * SIZE, 1
vstelm.d x4, XX, 1 * SIZE, 1
add.d XX, XX, INCX

ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
add.d X, X, INCX
vfmul.d x3, VXAR, x1
vfmul.d x4, VXAR, x2
addi.d I, I, -1
vstelm.d x3, XX, 0 * SIZE, 0
vstelm.d x4, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.d x3, XX, 0 * SIZE, 1
vstelm.d x4, XX, 1 * SIZE, 1
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 0
vinsgr2vr.w x2, t2, 0
vinsgr2vr.w x1, t3, 1
vinsgr2vr.w x2, t4, 1
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
vinsgr2vr.w x1, t1, 2
vinsgr2vr.w x2, t2, 2
vinsgr2vr.w x1, t3, 3
vinsgr2vr.w x2, t4, 3
add.d X, X, INCX

vfmul.s x3, VXAR, x1
vfmul.s x4, VXAR, x2
addi.d I, I, -1
vstelm.w x3, XX, 0 * SIZE, 0
vstelm.w x4, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.w x3, XX, 0 * SIZE, 1
vstelm.w x4, XX, 1 * SIZE, 1
add.d XX, XX, INCX
vstelm.w x3, XX, 0 * SIZE, 2
vstelm.w x4, XX, 1 * SIZE, 2
add.d XX, XX, INCX
vstelm.w x3, XX, 0 * SIZE, 3
vstelm.w x4, XX, 1 * SIZE, 3
#endif
add.d XX, XX, INCX
blt $r0, I, .L223
b .L997
.align 3

.L224: //alpha_r != 0.0 && alpha_i != 0.0
.L25:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
@@ -414,15 +298,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vstelm.w x4, XX, 1 * SIZE, 3
#endif
add.d XX, XX, INCX
blt $r0, I, .L224
b .L997
blt $r0, I, .L25
b .L29
.align 3

.L997:
andi I, N, 3
bge $r0, I, .L999
.align 3
/////// INCX != 1 && N < 8 ///////
.L29:
andi I, N, 3
beqz I, .L999
bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.

bceqz $fcc0, .L998

bceqz $fcc1, .L998

b .L995 // alpha_r == 0.0 && alpha_i == 0.0

.L995: // alpha_r == 0.0 && alpha_i == 0.0
ST a1, X, 0 * SIZE
ST a1, X, 1 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L995
b .L999
.L998:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
@@ -435,7 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ST s2, X, 1 * SIZE
add.d X, X, INCX
blt $r0, I, .L998
.align 3
b .L999

.L999:
move $r4, $r12


+ 3
- 0
kernel/loongarch64/zscal.S View File

@@ -53,6 +53,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE

li.d TEMP, 2 * SIZE
ld.d XX, $sp, 0 // Load dummy2
slli.d XX, XX, ZBASE_SHIFT
MTC a1, $r0
slli.d INCX, INCX, ZBASE_SHIFT
bge $r0, N, .L999
@@ -60,6 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
CMPEQ $fcc1, ALPHA_I, a1
bceqz $fcc0, .L50
bceqz $fcc1, .L50
beq XX, TEMP, .L50 // if dummy2 == 1, do not directly copy 0
srai.d I, N, 2
bne INCX, TEMP, .L20
bge $r0, I, .L15


+ 47
- 50
kernel/mips/zscal.c View File

@@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -25,61 +25,58 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"

// The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces.
// In certain cases, the expected return values for cblas_s/zscal differ from those of other upper-level interfaces.
// To handle this, we use the dummy2 parameter to differentiate between them.
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;
BLASLONG i = 0;
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;

inc_x2 = 2 * inc_x;
for ( i=0; i<n; i++ )
{
if ( da_r == 0.0 )
{
if ( da_i == 0.0 )
{
temp = 0.0;
x[ip+1] = 0.0 ;
}
else
{
temp = - da_i * x[ip+1] ;
if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
if (!isinf(x[ip+1]))
x[ip+1] = da_i * x[ip] ;
else x[ip+1] = NAN;
}
}
else
{
if ( da_i == 0.0 )
{
temp = da_r * x[ip] ;
if (!isinf(x[ip+1]))
x[ip+1] = da_r * x[ip+1];
else x[ip+1] = NAN;
}
else
{
temp = da_r * x[ip] - da_i * x[ip+1] ;
if (!isinf(x[ip+1]))
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
else x[ip+1] = NAN;
}
}
if ( da_r != da_r )
x[ip] = da_r;
else
x[ip] = temp;
ip += inc_x2;
}
if ((n <= 0) || (inc_x <= 0))
return(0);

return(0);
inc_x2 = 2 * inc_x;
if (dummy2 == 0) {
for (i = 0; i < n; i++)
{
if (da_r == 0.0 && da_i == 0.0)
{
x[ip] = 0.0;
x[ip+1] = 0.0;
}
else
{
temp = da_r * x[ip] - da_i * x[ip+1];
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
x[ip] = temp;
}

}
ip += inc_x2;
}
return(0);
}
for (i = 0; i < n; i++)
{
temp = da_r * x[ip] - da_i * x[ip+1];
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;

x[ip] = temp;
ip += inc_x2;
}

return(0);
}

+ 3
- 0
kernel/mips64/KERNEL View File

@@ -6,6 +6,9 @@ CROTKERNEL = ../mips/zrot.c
ZROTKERNEL = ../mips/zrot.c
CSWAPKERNEL = ../mips/zswap.c
ZSWAPKERNEL = ../mips/zswap.c

CSCALKERNEL = ../mips/zscal.c
ZSCALKERNEL = ../mips/zscal.c
ifndef SNRM2KERNEL


+ 6
- 0
kernel/power/zscal.S View File

@@ -51,6 +51,7 @@
#define X r8
#define INCX r9
#endif
#define FLAG r11
#endif

#if defined(_AIX) || defined(__APPLE__)
@@ -61,6 +62,7 @@
#define X r8
#define INCX r9
#endif
#define FLAG r11
#endif

#define FZERO f0
@@ -94,6 +96,10 @@
fcmpu cr0, FZERO, ALPHA_I
bne- cr0, LL(A1I1)

LDLONG FLAG, 104(SP)
cmpwi cr0, FLAG, 1
beq- cr0, LL(A1I1)

cmpwi cr0, INCX, 2 * SIZE
bne- cr0, LL(A0IN)



+ 1
- 1
kernel/power/zscal.c View File

@@ -136,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
if ( inc_x <= 0 )
return(0);

if (da_r == ZERO && da_i == ZERO) {
if (da_r == ZERO && da_i == ZERO && dummy2 == 0) {
//clear the vector and return
if (inc_x == 1) {
memset(x, 0, n*COMPSIZE*SIZE);


+ 5
- 0
kernel/power/zscal_ppc440.S View File

@@ -64,6 +64,7 @@
#endif

#define INC1 r11
#define FLAG r12

#define FZERO f0
#define ALPHA_R f1
@@ -97,6 +98,10 @@
fcmpu cr0, FZERO, ALPHA_I
bne- cr0, LL(A1I1)

lwz FLAG, FRAMESLOT(0)(SP)
cmpwi cr0, FLAG, 1
beq- cr0, LL(A1I1)

LL(A0IN):
srawi. r0, N, 3
mtspr CTR, r0


+ 10
- 0
kernel/riscv64/KERNEL.RISCV64_ZVL256B View File

@@ -169,6 +169,7 @@ SSYMV_U_KERNEL = symv_U_vector.c
SSYMV_L_KERNEL = symv_L_vector.c
DSYMV_U_KERNEL = symv_U_vector.c
DSYMV_L_KERNEL = symv_L_vector.c

CSYMV_U_KERNEL = ../generic/zsymv_k.c
CSYMV_L_KERNEL = ../generic/zsymv_k.c
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
@@ -201,3 +202,12 @@ endif
ifndef ZGEMM_BETA
ZGEMM_BETA = ../generic/zgemm_beta.c
endif

ZOMATCOPY_CN = zomatcopy_cn_vector.c
COMATCOPY_CN = zomatcopy_cn_vector.c

DOMATCOPY_CN = omatcopy_cn_vector.c
SOMATCOPY_CN = omatcopy_cn_vector.c

SAXPBYKERNEL = axpby_vector_v2.c
DAXPBYKERNEL = axpby_vector_v2.c

+ 149
- 0
kernel/riscv64/axpby_vector_v2.c View File

@@ -0,0 +1,149 @@
/***************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#if !defined(DOUBLE)
#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
#define FLOAT_V_T vfloat32m8_t
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m8)
#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m8)
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m8)
#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m8)
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
#else
#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
#endif

// y := beta * y + alpha * x  (real AXPBY) over n elements, vectorized with
// RISC-V RVV intrinsics (see the VSETVL/VLEV/VLSEV/VSEV/VSSEV macros above;
// e32m8 for float32, e64m4 for float64).
//
//   n       number of elements; returns immediately if n <= 0
//   alpha   scale applied to x
//   x/inc_x input vector and its element stride
//   beta    scale applied to y
//   y/inc_y in/out vector and its element stride
//
// Four vector paths are selected from the strides (unit/unit, unit-x,
// unit-y, both non-unit), plus a scalar path for inc_y == 0 where every
// update aliases y[0] and must be applied sequentially.
int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
{
FLOAT_V_T vx, vy;
unsigned int gvl;  // granted vector length: elements handled per iteration
if (n <= 0)
return (0);
if (inc_x == 1 && inc_y == 1)
{
// Both vectors contiguous: unit-stride loads/stores.
while (n > 0)
{
gvl = VSETVL(n);

vx = VLEV_FLOAT(x, gvl);
vy = VLEV_FLOAT(y, gvl);

// vy = beta*vy, then vy += alpha*vx (fused multiply-accumulate).
vy = VFMULVF_FLOAT(vy, beta, gvl);
vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl);

VSEV_FLOAT(y, vy, gvl);

x += gvl;
y += gvl;
n -= gvl;
}
}
else if (1 == inc_x)
{
// x contiguous, y strided; strided accesses take a byte stride.
BLASLONG stride_y = inc_y * sizeof(FLOAT);
while (n > 0)
{
gvl = VSETVL(n);
vy = VLSEV_FLOAT(y, stride_y, gvl);
vx = VLEV_FLOAT(x, gvl);

vy = VFMULVF_FLOAT(vy, beta, gvl);
vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl);

VSSEV_FLOAT(y, stride_y, vy, gvl);

x += gvl;
y += gvl * inc_y;
n -= gvl;
}
}
else if (1 == inc_y)
{
// y contiguous, x strided.
BLASLONG stride_x = inc_x * sizeof(FLOAT);

while (n > 0)
{
gvl = VSETVL(n);

vx = VLSEV_FLOAT(x, stride_x, gvl);
vy = VLEV_FLOAT(y, gvl);

vy = VFMULVF_FLOAT(vy, beta, gvl);
vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl);

VSEV_FLOAT(y, vy, gvl);

x += gvl * inc_x;
y += gvl;
n -= gvl;
}
}
else if (inc_y == 0)
{
// All n updates land on y[0]; apply them one by one in scalar code so
// the final value matches the sequential reference semantics.
FLOAT vf = y[0];
for (; n > 0; n--)
{
vf = (vf * beta) + (x[0] * alpha);
x += inc_x;
}
y[0] = vf;
}
else
{
// General case: neither stride is 1 and inc_y != 0; both sides use
// strided vector loads/stores.
BLASLONG stride_x = inc_x * sizeof(FLOAT);
BLASLONG stride_y = inc_y * sizeof(FLOAT);
while (n > 0)
{
gvl = VSETVL(n);
vy = VLSEV_FLOAT(y, stride_y, gvl);
vx = VLSEV_FLOAT(x, stride_x, gvl);

vy = VFMULVF_FLOAT(vy, beta, gvl);
vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl);

VSSEV_FLOAT(y, stride_y, vy, gvl);

x += gvl * inc_x;
y += gvl * inc_y;
n -= gvl;
}
}

return (0);
}

+ 123
- 0
kernel/riscv64/omatcopy_cn_vector.c View File

@@ -0,0 +1,123 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#if !defined(DOUBLE)
#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m4)()
#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4)
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
#else
#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m4)()
#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
#endif


// B := alpha * A, element-wise matrix copy-and-scale (the "_cn" kernel:
// column-major layout, no transpose, per its registration in
// KERNEL.RISCV64_ZVL256B). Each column is processed with RVV vectors
// (see the VSETVL/VLEV/VSEV/VFMULVF macros above); lda/ldb are the
// leading dimensions of A and B.
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
BLASLONG i,j;
FLOAT *aptr,*bptr;
size_t vl;  // granted vector length; assigned by VSETVL inside each loop body before the j+=vl increment runs

FLOAT_V_T va, vb,va1,vb1;  // NOTE(review): vb and vb1 are declared but never used
if ( rows <= 0 ) return(0);
if ( cols <= 0 ) return(0);

aptr = a;
bptr = b;

if ( alpha == 0.0 )
{
// alpha == 0: fill B with zeros; A is never read.
vl = VSETVL_MAX;
va = VFMVVF_FLOAT(0, vl);
for ( i=0; i<cols ; i++ )
{
for(j=0; j<rows; j+=vl)
{
vl = VSETVL(rows - j);
VSEV_FLOAT(bptr + j, va, vl);
}
bptr += ldb;
}
return(0);
}

if ( alpha == 1.0 )
{
// alpha == 1: straight column-by-column copy, no multiply needed.
for ( i=0; i<cols ; i++ )
{
for(j=0; j<rows; j+=vl)
{
vl = VSETVL(rows - j);
va = VLEV_FLOAT(aptr + j, vl);
VSEV_FLOAT(bptr + j, va, vl);
}
aptr += lda;
bptr += ldb;
}
return(0);
}
// General alpha: handle one leading column when cols is odd, then
// process two columns per iteration (independent loads/multiplies/stores
// interleaved within the row loop).
i = 0;
if( cols % 2 ){
for(j=0; j<rows; j+=vl)
{
vl = VSETVL(rows - j);
va = VLEV_FLOAT(aptr + j, vl);
va = VFMULVF_FLOAT(va, alpha, vl);
VSEV_FLOAT(bptr + j, va, vl);
}
aptr += lda;
bptr += ldb;
i = 1;
}
for ( ; i<cols ; i+=2 )
{
for(j=0; j<rows; j+=vl)
{
vl = VSETVL(rows - j);
va = VLEV_FLOAT(aptr + j, vl);
va1= VLEV_FLOAT(aptr + lda + j, vl);
va = VFMULVF_FLOAT(va, alpha, vl);
va1= VFMULVF_FLOAT(va1, alpha, vl);
VSEV_FLOAT(bptr + j, va, vl);
VSEV_FLOAT(bptr + ldb + j, va1, vl);
}
aptr += 2 * lda;
bptr += 2 * ldb;
}

return(0);
}

+ 47
- 0
kernel/riscv64/zaxpy_vector.c View File

@@ -43,8 +43,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
#endif

#if !defined(DOUBLE)
inline int small_caxpy_kernel(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
#else
inline int small_zaxpy_kernel(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
#endif
{
    /* Scalar complex AXPY fallback used for short vectors:
     *   y[k] += (da_r + i*da_i) * x[k]        (non-CONJ build)
     *   y[k] += conj-style accumulation       (CONJ build)
     * Elements are interleaved (re, im) pairs, so the walk advances by
     * twice the declared increments. Returns 0 always; a zero alpha or
     * non-positive n is a no-op. */
    BLASLONG k;
    BLASLONG xi = 0, yi = 0;
    const BLASLONG xstep = 2 * inc_x;
    const BLASLONG ystep = 2 * inc_y;

    if (n <= 0) return(0);
    if (da_r == 0.0 && da_i == 0.0) return(0);

    for (k = 0; k < n; k++)
    {
#if !defined(CONJ)
        y[yi]     += ( da_r * x[xi]     - da_i * x[xi + 1] );
        y[yi + 1] += ( da_r * x[xi + 1] + da_i * x[xi] );
#else
        y[yi]     += ( da_r * x[xi]     + da_i * x[xi + 1] );
        y[yi + 1] -= ( da_r * x[xi + 1] - da_i * x[xi] );
#endif
        xi += xstep;
        yi += ystep;
    }
    return(0);
}

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
#if !defined(DOUBLE)
if(n < 16) {
return small_caxpy_kernel(n, dummy0, dummy1, da_r, da_i, x, inc_x, y, inc_y, dummy, dummy2);
}
#else
if(n < 8) {
return small_zaxpy_kernel(n, dummy0, dummy1, da_r, da_i, x, inc_x, y, inc_y, dummy, dummy2);
}
#endif
BLASLONG i = 0, j = 0;
BLASLONG ix = 0,iy = 0;
if(n <= 0) return(0);


+ 53
- 1
kernel/riscv64/zdot_vector.c View File

@@ -68,8 +68,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4)
#endif

#if !defined(DOUBLE)
inline OPENBLAS_COMPLEX_FLOAT small_cdot_kernel(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
inline OPENBLAS_COMPLEX_FLOAT small_zdot_kernel(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#endif
{
    /* Scalar complex dot-product fallback for short vectors.
     * Accumulates sum(x[k] * y[k]) (CONJ build conjugates x) over n
     * interleaved (re, im) pairs and returns it as a complex value.
     * n < 1 yields 0 + 0i. */
    OPENBLAS_COMPLEX_FLOAT result;
    FLOAT acc_r = 0.0;
    FLOAT acc_i = 0.0;
    BLASLONG k, xi, yi;
    const BLASLONG xstep = 2 * inc_x;
    const BLASLONG ystep = 2 * inc_y;

    CREAL(result) = 0.0;
    CIMAG(result) = 0.0;

    if (n < 1) return(result);

    for (k = 0, xi = 0, yi = 0; k < n; k++, xi += xstep, yi += ystep)
    {
#if !defined(CONJ)
        acc_r += ( x[xi] * y[yi]     - x[xi + 1] * y[yi + 1] );
        acc_i += ( x[xi + 1] * y[yi] + x[xi]     * y[yi + 1] );
#else
        acc_r += ( x[xi] * y[yi]     + x[xi + 1] * y[yi + 1] );
        acc_i -= ( x[xi + 1] * y[yi] - x[xi]     * y[yi + 1] );
#endif
    }

    CREAL(result) = acc_r;
    CIMAG(result) = acc_i;
    return(result);
}
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
#if !defined(DOUBLE)
if(n < 16) {
return small_cdot_kernel(n, x, inc_x, y, inc_y);
}
#else
if(n < 8) {
return small_zdot_kernel(n, x, inc_x, y, inc_y);
}
#endif
BLASLONG i=0, j=0;
BLASLONG ix=0,iy=0;
FLOAT dot[2];
@@ -148,4 +200,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
CREAL(result) = dot[0];
CIMAG(result) = dot[1];
return(result);
}
}

+ 63
- 20
kernel/riscv64/zgemv_n_vector.c View File

@@ -66,7 +66,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
BLASLONG lda2 = lda * 2;
vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl);
vy1_new = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl);
for (k = 0, j = 0; k < m / gvl; k++)
for (k = 0, j = 0; k < m / gvl; k ++)
{
a_ptr = a;
ix = 0;
@@ -121,30 +121,73 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
#endif
a_ptr += lda2;
ix += inc_x2;
}

for (; i < n; i += 4)
for (i = n % 4 ; i < n; i += 4)
{
#if !defined(XCONJ)

x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4);
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4);
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4);
temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 4);
temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 4);
VSEV_FLOAT(&temp_rr[0], temp_rv, 4);
VSEV_FLOAT(&temp_ii[0], temp_iv, 4);
// temp_rr[0] = alpha_r * x[ix] - alpha_i * x[ix + 1];
// temp_rr[1] = alpha_r * x[ix + inc_x2] - alpha_i * x[ix + inc_x2 + 1];
x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 2);
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 2);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 2);

// temp_ii[0] = alpha_r * x[ix + 1] + alpha_i * x[ix];
// temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] + alpha_i * x[ix + inc_x2];
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 2);
temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 2);
VSEV_FLOAT(&temp_rr[0], temp_rv, 2);
VSEV_FLOAT(&temp_ii[0], temp_iv, 2);
// temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] - alpha_i * x[ix + inc_x2 * 2 + 1];
// temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] - alpha_i * x[ix + inc_x2 * 3 + 1];
x_v0 = VLSEV_FLOAT(&x[ix + inc_x2 * 2], inc_x2 * sizeof(FLOAT), 2);
x_v1 = VLSEV_FLOAT(&x[ix + inc_x2 * 2 + 1], inc_x2 * sizeof(FLOAT), 2);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 2);

// temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] + alpha_i * x[ix + inc_x2 * 2];
// temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] + alpha_i * x[ix + inc_x2 * 3];
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 2);
temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 2);
VSEV_FLOAT(&temp_rr[2], temp_rv, 2);
VSEV_FLOAT(&temp_ii[2], temp_iv, 2);

#else
x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4);
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4);
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4);
temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 4);
temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_r, x_v1, 4);
VSEV_FLOAT(&temp_rr[0], temp_rv, 4);
VSEV_FLOAT(&temp_ii[0], temp_iv, 4);
// temp_rr[0] = alpha_r * x[ix] + alpha_i * x[ix + 1];
// temp_rr[1] = alpha_r * x[ix + inc_x2] + alpha_i * x[ix + inc_x2 + 1];
x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 2);
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 2);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 2);


// temp_ii[0] = alpha_r * x[ix + 1] - alpha_i * x[ix];
// temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] - alpha_i * x[ix + inc_x2];
temp_iv = VFMUL_VF_FLOAT(x_v1, alpha_r, 2);
temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_i, x_v0, 2);
VSEV_FLOAT(&temp_rr[0], temp_rv, 2);
VSEV_FLOAT(&temp_ii[0], temp_iv, 2);

// temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] + alpha_i * x[ix + inc_x2 * 2 + 1];
// temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] + alpha_i * x[ix + inc_x2 * 3 + 1];
x_v0 = VLSEV_FLOAT(&x[ix + inc_x2 * 2], inc_x2 * sizeof(FLOAT), 2);
x_v1 = VLSEV_FLOAT(&x[ix + inc_x2 * 2 + 1], inc_x2 * sizeof(FLOAT), 2);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 2);


temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] - alpha_i * x[ix + inc_x2 * 2];
temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] - alpha_i * x[ix + inc_x2 * 3];
temp_iv = VFMUL_VF_FLOAT(x_v1, alpha_r, 2);
temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_i, x_v0, 2);
VSEV_FLOAT(&temp_rr[2], temp_rv, 2);
VSEV_FLOAT(&temp_ii[2], temp_iv, 2);



#endif

@@ -257,7 +300,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl);
j += gvl * 2;
iy += inc_yv;
iy += inc_yv ;
}
// tail
if (j / 2 < m)


+ 106
- 0
kernel/riscv64/zomatcopy_cn_vector.c View File

@@ -0,0 +1,106 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"


#if !defined(DOUBLE)
#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
#define FLOAT_V_T vfloat32m4_t
#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m4)
#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
#define VLSEG2_FLOAT RISCV_RVV(vlseg2e32_v_f32m4x2)
#define VSSEG2_FLOAT RISCV_RVV(vsseg2e32_v_f32m4x2)
#define FLOAT_VX2_T vfloat32m4x2_t
#define VGET_VX2 RISCV_RVV(vget_v_f32m4x2_f32m4)
#define VSET_VX2 RISCV_RVV(vset_v_f32m4_f32m4x2)
#else
#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
#define FLOAT_V_T vfloat64m4_t
#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
#define VLSEG2_FLOAT RISCV_RVV(vlseg2e64_v_f64m4x2)
#define VSSEG2_FLOAT RISCV_RVV(vsseg2e64_v_f64m4x2)
#define FLOAT_VX2_T vfloat64m4x2_t
#define VGET_VX2 RISCV_RVV(vget_v_f64m4x2_f64m4)
#define VSET_VX2 RISCV_RVV(vset_v_f64m4_f64m4x2)
#endif

/*
 * Complex out-of-place matrix copy with scaling, column-major source and
 * destination ("cn" variant): b[:,i] = alpha * a[:,i] for each column.
 *
 * rows, cols      matrix dimensions, counted in complex elements
 * alpha_r/alpha_i complex scale factor alpha = alpha_r + i*alpha_i
 * a, lda          source matrix and leading dimension (complex elements)
 * b, ldb          destination matrix and leading dimension (complex elements)
 *
 * Always returns 0 (OpenBLAS kernel convention).
 */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
    BLASLONG i, j, ia;
    FLOAT *aptr, *bptr;
    FLOAT_V_T bptr_v0, bptr_v1, aptr_v0, aptr_v1;
    FLOAT_VX2_T va, vb;
    unsigned int gvl = 0;

    if ( rows <= 0 ) return(0);
    if ( cols <= 0 ) return(0);

    aptr = a;
    bptr = b;

    /* leading dimensions arrive in complex elements; convert to FLOAT units */
    lda *= 2;
    ldb *= 2;

    for ( i = 0; i < cols; i++ )
    {
        ia = 0;
        for ( j = 0; j < rows; j += gvl )
        {
            gvl = VSETVL(rows - j);
            /* segment load de-interleaves: field 0 = real parts, field 1 = imag parts */
            va = VLSEG2_FLOAT(aptr + ia, gvl);
            aptr_v0 = VGET_VX2(va, 0);
            aptr_v1 = VGET_VX2(va, 1);
            /* imag(b) = alpha_r*imag(a) + alpha_i*real(a) */
            bptr_v1 = VFMUL_VF_FLOAT(aptr_v1, alpha_r, gvl);
            bptr_v1 = VFMACCVF_FLOAT(bptr_v1, alpha_i, aptr_v0, gvl);
            /* real(b) = alpha_r*real(a) - alpha_i*imag(a) */
            bptr_v0 = VFMUL_VF_FLOAT(aptr_v0, alpha_r, gvl);
            bptr_v0 = VFNMSACVF_FLOAT(bptr_v0, alpha_i, aptr_v1, gvl);
            /* seed the tuple from va so VSET_VX2 never reads an
             * indeterminate value; both fields are overwritten below */
            vb = va;
            vb = VSET_VX2(vb, 0, bptr_v0);
            vb = VSET_VX2(vb, 1, bptr_v1);
            VSSEG2_FLOAT(&bptr[ia], vb, gvl);
            ia += gvl * 2;
        }
        aptr += lda;
        bptr += ldb;
    }

    return(0);

}

+ 40
- 49
kernel/riscv64/zscal.c View File

@@ -27,65 +27,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/

#include "common.h"

// The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces.
// In certain cases, the expected return values for cblas_s/zscal differ from those of other upper-level interfaces.
// To handle this, we use the dummy2 parameter to differentiate between them.
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;
BLASLONG i = 0;
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;

if ( (n <= 0) || (inc_x <= 0))
return(0);
if ((n <= 0) || (inc_x <= 0))
return(0);

inc_x2 = 2 * inc_x;
if (dummy2 == 0) {
for (i = 0; i < n; i++)
{
if (da_r == 0.0 && da_i == 0.0)
{
x[ip] = 0.0;
x[ip+1] = 0.0;
}
else
{
temp = da_r * x[ip] - da_i * x[ip+1];
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
x[ip] = temp;
}

inc_x2 = 2 * inc_x;
for ( i=0; i<n; i++ )
{
if ( da_r == 0.0 )
{
if ( da_i == 0.0 )
{
temp = 0.0;
x[ip+1] = 0.0 ;
}
else
{
temp = - da_i * x[ip+1] ;
if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
if (!isinf(x[ip+1]))
x[ip+1] = da_i * x[ip] ;
else x[ip+1] = NAN;
}
}
else
{
if ( da_i == 0.0 )
{
temp = da_r * x[ip] ;
x[ip+1] = da_r * x[ip+1];
}
else
{
temp = da_r * x[ip] - da_i * x[ip+1] ;
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;
}
}
x[ip] = temp;
ip += inc_x2;
}
return(0);
}
for (i = 0; i < n; i++)
{
temp = da_r * x[ip] - da_i * x[ip+1];
x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ;

ip += inc_x2;
}

return(0);
x[ip] = temp;
ip += inc_x2;
}

return(0);
}



+ 13
- 1
kernel/riscv64/zscal_rvv.c View File

@@ -70,6 +70,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
FLOAT_VX2_T vx2;

if(inc_x == 1) {
if (dummy2 == 0 && da_r==0. && da_i == 0.) {
BLASLONG i;
for (i=0; i < n*2; i++) x[i]=0.;
return(0);
} else {

for (size_t vl; n > 0; n -= vl, x += vl*2) {
vl = VSETVL(n);
@@ -80,6 +85,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F

vt = VFMULVF_FLOAT(vr, da_r, vl);
vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);

vi = VFMULVF_FLOAT(vi, da_r, vl);
vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);

@@ -87,9 +93,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
vx2 = VSET_VX2(vx2, 1, vi);
VSSEG_FLOAT(x, vx2, vl);
}
}

} else {

if (dummy2 == 0 && da_r==0. && da_i == 0.) {
BLASLONG i,ix=0,inc_x2=2*inc_x;
for (i=0; i < n; i++) {x[ix]=0.;x[ix+1]=0.;ix+=inc_x2;};
return(0);
} else {
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
vl = VSETVL(n);

@@ -105,6 +116,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
vx2 = VSET_VX2(vx2, 0, vt);
vx2 = VSET_VX2(vx2, 1, vi);
VSSSEG_FLOAT(x, stride_x, vx2, vl);
}
}
}



+ 10
- 3
kernel/riscv64/zscal_vector.c View File

@@ -57,9 +57,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
if((n <= 0) || (inc_x <= 0))
return(0);

unsigned int gvl = 0;
FLOAT_V_T vt, v0, v1;
{
if (dummy2 == 0 && da_r == 0. && da_i == 0.) {
int i,inc_x2,ix;
inc_x2 = 2*inc_x;
ix=0;
for (i=0;i<n;i++){x[ix]=0.;x[ix+1]=0.;ix+=inc_x2;}
} else {
unsigned int gvl = 0;
FLOAT_V_T vt, v0, v1;
{
gvl = VSETVL(n);
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG inc_xv = inc_x * 2 * gvl;
@@ -91,6 +97,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
}
}
}
return(0);
}

+ 5
- 0
kernel/sparc/KERNEL View File

@@ -86,3 +86,8 @@ endif
ifndef QROTMKERNEL
QROTMKERNEL = ../generic/rotm.c
endif

SSCALKERNEL = ../arm/scal.c
DSCALKERNEL = ../arm/scal.c
CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c

+ 3
- 0
kernel/x86/KERNEL View File

@@ -200,3 +200,6 @@ endif
ifndef QROTMKERNEL
QROTMKERNEL = ../generic/rotm.c
endif

CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c

+ 2
- 2
kernel/x86_64/KERNEL View File

@@ -323,11 +323,11 @@ DSCALKERNEL = scal_sse2.S
endif

ifndef CSCALKERNEL
CSCALKERNEL = zscal_sse.S
CSCALKERNEL = ../arm/zscal.c
endif

ifndef ZSCALKERNEL
ZSCALKERNEL = zscal_sse2.S
ZSCALKERNEL = ../arm/zscal.c
endif

ifndef ASCALKERNEL


+ 52
- 22
kernel/x86_64/cscal.c View File

@@ -229,10 +229,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

if ( da_i == 0.0 )
{
if (!dummy2) {
while(j < n1)
{
x[i]=0.0;
x[i+1]=0.0;
x[i+inc_x]=0.0;
@@ -244,21 +243,48 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

while(j < n)
{
x[i]=0.0;
x[i+1]=0.0;
i += inc_x ;
j++;

}
} else {
float temp;
while(j < n1)
{
if (isnan(x[i])|| isnan(x[i+1]))
temp=NAN;
else
temp=0.0;
x[i]=temp;
x[i+1]=temp;
if (isnan(x[i+inc_x])|| isnan(x[i+inc_x+1]))
temp=NAN;
else
temp=0.0;
x[i+inc_x]= temp;
x[i+inc_x+1]= temp;
i += 2*inc_x;
j+=2;

}
while(j < n)
{
if (isnan(x[i])|| isnan(x[i+1]))
temp=NAN;
else
temp=0.0;
x[i]=temp;
x[i+1]=temp;
i += inc_x;
j++;
}
}
}
else
{

while(j < n1)
{
if (isnan(x[i]) || isinf(x[i]))
temp0 = NAN;
else
@@ -278,7 +304,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
x[i+inc_x] = temp1;
i += 2*inc_x ;
j+=2;

}

while(j < n)
@@ -305,14 +330,12 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
else
{


if ( da_i == 0.0 )
if ( da_i == 0.0 && dummy2 )
{
BLASLONG n1 = n & -2;

while(j < n1)
{
temp0 = da_r * x[i];
x[i+1] = da_r * x[i+1];
x[i] = temp0;
@@ -367,22 +390,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
return(0);
}


BLASLONG n1 = n & -16;
if ( n1 > 0 )
{

alpha[0] = da_r;
alpha[1] = da_i;
if ( da_r == 0.0 )
if ( da_i == 0 )
if ( da_i == 0 && !dummy2)
cscal_kernel_16_zero(n1 , alpha , x);
else
cscal_kernel_16_zero_r(n1 , alpha , x);
cscal_kernel_16/*_zero_r*/(n1 , alpha , x);
else
cscal_kernel_16(n1 , alpha , x);

i = n1 << 1;
j = n1;
}
@@ -393,6 +413,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
{
FLOAT res=0.0;
if (isnan(da_r)) res= da_r;
if (dummy2)
if (isnan(x[i])||isnan(x[i+1])) res= NAN;
while(j < n)
{
x[i]=res;
@@ -415,7 +437,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

} else
{

while(j < n)
{
temp0 = -da_i * x[i+1];
@@ -424,11 +445,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
if (!isinf(x[i+1]))
x[i+1] = da_i * x[i];
else x[i+1] = NAN;
if ( x[i] == x[i]) //preserve NaN
if ( !isnan(x[i])) //preserve NaN
x[i] = temp0;
i += 2 ;
j++;

}

}
@@ -439,12 +459,22 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

if ( da_i == 0.0 )
{

while(j < n)
{
temp0 = da_r * x[i];
x[i+1] = da_r * x[i+1];
if (dummy2) {
if (isnan(x[i])||isinf(x[i])) temp0=NAN;
if (isnan(x[i+1])||isinf(x[i+1]))
x[i+1]=NAN;
else
x[i+1] = da_r * x[i+1];
} else {
if (isnan(x[i]))
x[i+1] = NAN;
else
x[i+1] = da_r * x[i+1];
}
x[i] = temp0;
i += 2 ;
j++;
@@ -476,7 +506,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

temp0 = da_r * x[i] - da_i * x[i+1];
x[i+1] = da_r * x[i+1] + da_i * x[i];
x[i] = temp0;
if(!isnan(x[i]))x[i] = temp0;
i += 2 ;
j++;



+ 49
- 19
kernel/x86_64/zscal.c View File

@@ -222,13 +222,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

if ( da_r == 0.0 )
{

BLASLONG n1 = n & -2;

if ( da_i == 0.0 )
{
if (!dummy2) {
while(j < n1)
{

x[i]=0.0;
x[i+1]=0.0;
x[i+inc_x]=0.0;
@@ -245,9 +246,40 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
x[i+1]=0.0;
i += inc_x ;
j++;
}
} else {
float temp;
while(j < n1)
{
if (isnan(x[i])|| isnan(x[i+1]))
temp=NAN;
else
temp=0.0;
x[i]=temp;
x[i+1]=temp;
if (isnan(x[i+inc_x])|| isnan(x[i+inc_x+1]))
temp=NAN;
else
temp=0.0;
x[i+inc_x]= temp;
x[i+inc_x+1]= temp;
i += 2*inc_x;
j+=2;

}
while(j < n)
{
if (isnan(x[i])|| isnan(x[i+1]))
temp=NAN;
else
temp=0.0;
x[i]=temp;
x[i+1]=temp;
i += inc_x;
j++;

}
}
}
else
{
@@ -260,7 +292,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
temp0 = -da_i * x[i+1];
if (!isinf(x[i+1]))
x[i+1] = da_i * x[i];
else x[i+1] = NAN;
else x[i+1] = NAN;
x[i] = temp0;
if (isnan(x[i+inc_x]) || isinf(x[i+inc_x]))
temp1 = NAN;
@@ -291,16 +323,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

}



}

}
else
{


if ( da_i == 0.0 )
if ( da_i == 0.0 && dummy2)
{
BLASLONG n1 = n & -2;

@@ -370,26 +399,27 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
alpha[1] = da_i;

if ( da_r == 0.0 )
if ( da_i == 0 )
if ( da_i == 0 && !dummy2 )
zscal_kernel_8_zero(n1 , alpha , x);
else
// zscal_kernel_8_zero_r(n1 , alpha , x);
zscal_kernel_8(n1 , alpha , x);
else
if ( da_i == 0 && da_r == da_r)
/* if ( da_i == 0 && da_r == da_r )
zscal_kernel_8_zero_i(n1 , alpha , x);
else
else*/
zscal_kernel_8(n1 , alpha , x);
}
i = n1 << 1;
j = n1;
if ( da_r == 0.0 || da_r != da_r )
}
if ( da_r == 0.0 || isnan(da_r) )
{
if ( da_i == 0.0 )
{
FLOAT res=0.0;
if (da_r != da_r) res= da_r;
FLOAT res=0.0;
if (isnan(da_r)) res= da_r;
if (dummy2)
if (isnan(x[i])||isnan(x[i+1])) res= NAN;
while(j < n)
{
x[i]=res;
@@ -412,7 +442,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

} else
{

while(j < n)
{
temp0 = -da_i * x[i+1];
@@ -421,7 +450,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
if (!isinf(x[i+1]))
x[i+1] = da_i * x[i];
else x[i+1] = NAN;
if ( x[i] == x[i]) //preserve NaN
if ( !isnan(x[i])) //preserve NaN
x[i] = temp0;
i += 2 ;
j++;
@@ -437,8 +466,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
{
while(j < n)
{

temp0 = da_r * x[i];
if (isnan(x[i]))x[i+1]=NAN;
else
x[i+1] = da_r * x[i+1];
x[i] = temp0;
i += 2 ;
@@ -453,7 +483,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
{
temp0 = da_r * x[i] - da_i * x[i+1];
x[i+1] = da_r * x[i+1] + da_i * x[i];
x[i] = temp0;
if(!isnan(x[i]))x[i] = temp0;
i += 2 ;
j++;



+ 59
- 11
kernel/zarch/cscal.c View File

@@ -210,7 +210,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG n1 = n & -2;

if (da_i == 0.0) {
if (dummy2 == 0) {
while (j < n1) {

x[i] = 0.0;
@@ -230,11 +230,43 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
j++;

}
} else {
while (j < n1) {
if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) {
x[i] = NAN;
x[i+1] = NAN;
}else{
x[i] = 0.0;
x[i + 1] = 0.0;
}
if (isnan(x[i+inc_x]) || isinf(x[i+inc_x]) || isnan(x[i+1+inc_x])) {
x[i + inc_x] = NAN;
x[i + 1 + inc_x] = NAN;
} else {
x[i + inc_x] = 0.0;
x[i + 1 + inc_x] = 0.0;
}
i += 2 * inc_x;
j += 2;

}

while (j < n) {
if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) {
x[i] = NAN;
x[i+1] = NAN;
}else{
x[i] = 0.0;
x[i + 1] = 0.0;
}
i += inc_x;
j++;
}
}
} else {

while (j < n1) {
if (isnan(x[i]) || isinf(x[i]))
if (isnan(x[i]) || isinf(x[i]))
temp0 = NAN;
else
temp0 = -da_i * x[i + 1];
@@ -276,7 +308,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

} else {

if (da_i == 0.0) {
if (da_i == 0.0 && dummy2) {
BLASLONG n1 = n & -2;

while (j < n1) {
@@ -335,12 +367,16 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
alpha[1] = da_i;

if (da_r == 0.0)
if (da_i == 0)
if (da_i == 0 && dummy2 == 0)
cscal_kernel_16_zero(n1, x);
else
else {
/* if (dummy2 == 0)
cscal_kernel_16_zero_r(n1, alpha, x);
else if (da_i == 0)
cscal_kernel_16_zero_i(n1, alpha, x);
else*/
cscal_kernel_16(n1, da_r, da_i, x);
}
/* else if (da_i == 0 && !isnan(da_r))
cscal_kernel_16/*_zero_i(n1, alpha, x);*/
else
cscal_kernel_16(n1, da_r, da_i, x);

@@ -354,7 +390,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
float res = 0.0;
if (isnan(da_r)) res = da_r;
while (j < n) {

if (dummy2)
if (isnan(x[i])|| isnan(x[i+1])) res=NAN;
x[i] = res;
x[i + 1] = res;
i += 2;
@@ -382,7 +419,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
x[i + 1] = da_i * x[i];
else
x[i + 1] = NAN;
if (x[i] == x[i])
if (!isnan(x[i]))
x[i] = temp0;
i += 2;
j++;
@@ -398,7 +435,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
while (j < n) {

temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
if (dummy2) {
if (isnan(x[i])||isinf(x[i]))temp0 = NAN;
if (isnan(x[i+1])||isinf(x[i+1]))
x[i+1] = NAN;
else
x[i+1] = da_r * x[i + 1];
} else {
if (isnan(x[i]))
x[i + 1] = NAN;
else
x[i + 1] = da_r * x[i + 1];
}
x[i] = temp0;
i += 2;
j++;
@@ -411,7 +459,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
if (!isnan(x[i])) x[i] = temp0;
i += 2;
j++;



+ 52
- 11
kernel/zarch/zscal.c View File

@@ -208,7 +208,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG n1 = n & -2;

if (da_i == 0.0) {
if (dummy2 == 0) {
while (j < n1) {

x[i] = 0.0;
@@ -228,7 +228,38 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
j++;

}

} else {
while (j < n1) {
if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) {
x[i] = NAN;
x[i+1] = NAN;
} else {
x[i] = 0.0;
x[i+1] = 0.0;
}
if (isnan(x[i+inc_x]) || isinf(x[i+inc_x]) || isnan(x[i+inc_x+1])) {
x[i + inc_x] = NAN;
x[i + inc_x + 1] = NAN;
} else {
x[i + inc_x] = 0.;
x[i + inc_x + 1] = 0.;
}
i += 2 * inc_x;
j += 2;
}
while (j < n) {
if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) {
x[i] = NAN;
x[i+1] = NAN;
} else {
x[i] = 0.;
x[i+1] = 0.;
}
i += inc_x;
j++;
}
}
} else {

while (j < n1) {
@@ -276,7 +307,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

} else {

if (da_i == 0.0) {
if (da_i == 0.0 && dummy2) {
BLASLONG n1 = n & -2;

while (j < n1) {
@@ -335,12 +366,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
alpha[1] = da_i;

if (da_r == 0.0)
if (da_i == 0)
if (da_i == 0 && dummy2 == 0)
zscal_kernel_8_zero(n1, x);
else
zscal_kernel_8(n1, da_r, da_i, x);
else if (da_i == 0 && da_r == da_r)
zscal_kernel_8_zero_i(n1, alpha, x);
else
zscal_kernel_8(n1, da_r, da_i, x);

@@ -354,7 +383,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
double res= 0.0;
if (isnan(da_r)) res = da_r;
while (j < n) {

if (dummy2)
if (isnan(x[i]) || isnan(x[i+1])) res = NAN;
x[i] = res;
x[i + 1] = res;
i += 2;
@@ -381,7 +411,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
x[i + 1] = da_i * x[i];
else
x[i + 1] = NAN;
if (x[i]==x[i])
if (!isnan(x[i]))
x[i] = temp0;
i += 2;
j++;
@@ -397,8 +427,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
while (j < n) {

temp0 = da_r * x[i];
x[i + 1] = da_r * x[i + 1];
x[i] = temp0;
if (dummy2) {
if (isnan(x[i]) || isinf(x[i])) temp0 = NAN;
if (isnan(x[i + 1]) || isinf(x[i + 1]))
x[i + 1] = NAN;
else
x[i + 1] = da_r * x[i + 1];
} else {
if (isnan(x[i]))
x[i + 1] = NAN;
else
x[i + 1] = da_r * x[i + 1];
}
x[i] = temp0;
i += 2;
j++;

@@ -410,7 +451,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

temp0 = da_r * x[i] - da_i * x[i + 1];
x[i + 1] = da_r * x[i + 1] + da_i * x[i];
x[i] = temp0;
if (!isnan(x[i])) x[i] = temp0;
i += 2;
j++;



+ 474
- 0
utest/test_gemv.c View File

@@ -128,3 +128,477 @@ CTEST(dgemv, 0_nan_inf_incy_2)
}

#endif

#ifdef BUILD_COMPLEX

// cgemv with alpha == beta == 0: the BLAS contract is that Y is set to zero
// outright, even where it previously held NaN or Inf (no multiply by beta).
CTEST(cgemv, 0_nan_inf)
{
int i;
blasint N = 17;
blasint incX = 1;
blasint incY = 1;
float alpha[2] = {0.0, 0.0};
float beta[2] = {0.0, 0.0};
char trans = 'N';
float A[17 * 17 * 4];
float X[17 * 2];
float Y[17 * 2];

memset(A, 0, sizeof(A));
memset(X, 0, sizeof(X));
// Seed Y with alternating NaN and Inf complex pairs.
for (i = 0; i < (2 * N - 2); i += 4)
{
Y[i] = NAN;
Y[i + 1] = NAN;

Y[i + 2] = INFINITY;
Y[i + 3] = INFINITY;
}
// Last complex element (not covered by the stride-4 loop) gets NaN too.
Y[2 * N - 1] = NAN;
Y[2 * N - 2] = NAN;
BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
// Every entry of Y must have been overwritten with exactly 0.
for (i = 0; i < 2 * N; i ++)
ASSERT_TRUE(Y[i] == 0.0);
}

// cgemv with alpha == beta == 0 and incY == 2: only every other complex
// element of Y belongs to the vector; those must be zeroed even if they held
// NaN/Inf, while the gap elements (memset to 0) must stay untouched.
CTEST(cgemv, 0_nan_inf_incy_2)
{
    int i;
    blasint N = 17;
    blasint incX = 1;
    blasint incY = 2;
    float alpha[2] = {0.0, 0.0};
    float beta[2] = {0.0, 0.0};
    char trans = 'N';
    float A[17 * 17 * 4];
    /* N complex elements at incX == 1 need 2*N floats; the original
     * declaration X[17] was undersized (potential out-of-bounds read). */
    float X[17 * 2];
    float Y[17 * 2 * 2];
    float *ay = Y;

    memset(A, 0, sizeof(A));
    memset(X, 0, sizeof(X));
    memset(Y, 0, sizeof(Y));
    /* Seed the strided (in-vector) elements with alternating NaN and Inf. */
    for (i = 0; i < (2 * N - 2); i += 4)
    {
        ay[0] = NAN;
        ay[1] = NAN;
        ay += 4;
        ay[0] = INFINITY;
        ay[1] = INFINITY;
        ay += 4;
    }
    /* Last in-vector complex element, not covered by the loop above. */
    Y[4 * N - 4] = NAN;
    Y[4 * N - 3] = NAN;
    BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
    /* All of Y must now be zero: vector elements overwritten, gaps untouched. */
    for (i = 0; i < 4 * N; i ++)
        ASSERT_TRUE(Y[i] == 0.0);
}

// cgemv with alpha == 0 and purely imaginary beta == 2i: Y must be scaled by
// beta with componentwise complex multiplication, so NaN/Inf inputs propagate
// (e.g. real part becomes a*0 - b*2, where Inf*0 yields NaN).
CTEST(cgemv, 0_2_nan_1_inf_1)
{
int i;
blasint N = 17;
blasint incX = 1;
blasint incY = 1;
float alpha[2] = {0.0, 0.0};
float beta[2] = {0.0, 2.0};
char trans = 'N';
float A[17 * 17 * 4];
float X[17 * 2];
float Y[17 * 2];

memset(A, 0, sizeof(A));
memset(X, 0, sizeof(X));
// Even complex elements: NaN+1i; odd complex elements: Inf+1i.
for (i = 0; i < (2 * N - 2); i += 4)
{
Y[i] = NAN;
Y[i + 1] = 1.0;

Y[i + 2] = INFINITY;
Y[i + 3] = 1.0;
}
Y[2 * N - 2] = NAN;
Y[2 * N - 1] = 1.0;
BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
for (i = 0; i < 2 * N; i += 2) {
if ((i >> 1) % 2){
// Odd complex elements started as Inf+1i: (Inf+1i)*2i -> NaN + Inf*i.
ASSERT_TRUE(isnan(Y[i]));
ASSERT_TRUE(isinf(Y[i + 1]));
}
else {
// Even complex elements started as NaN+1i: NaN contaminates both parts.
ASSERT_TRUE(isnan(Y[i]));
ASSERT_TRUE(isnan(Y[i + 1]));
}
}
}

// cgemv with alpha == 0, beta == 2i, incY == 2: in-vector elements of Y are
// scaled by beta (propagating NaN/Inf through the complex multiply), while
// the stride-gap elements must remain exactly zero.
CTEST(cgemv, 0_2_nan_1_inf_1_incy_2)
{
    int i;
    blasint N = 17;
    blasint incX = 1;
    blasint incY = 2;
    float alpha[2] = {0.0, 0.0};
    float beta[2] = {0.0, 2.0};
    char trans = 'N';
    float A[17 * 17 * 4];
    /* N complex elements at incX == 1 need 2*N floats; the original
     * declaration X[17] was undersized (potential out-of-bounds read). */
    float X[17 * 2];
    float Y[17 * 2 * 2];
    float *ay = Y;

    memset(A, 0, sizeof(A));
    memset(X, 0, sizeof(X));
    memset(Y, 0, sizeof(Y));
    /* In-vector elements alternate NaN+1i and Inf+1i. */
    for (i = 0; i < (2 * N - 2); i += 4)
    {
        ay[0] = NAN;
        ay[1] = 1.0;
        ay += 4;
        ay[0] = INFINITY;
        ay[1] = 1.0;
        ay += 4;
    }
    Y[4 * N - 4] = NAN;
    Y[4 * N - 3] = 1.0;
    BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
    for (i = 0; i < 4 * N; i += 2) {
        if ((i >> 1) % 2) {
            /* Stride-gap complex element: must be untouched (still zero). */
            ASSERT_TRUE(Y[i] == 0.0);
            ASSERT_TRUE(Y[i + 1] == 0.0);
        }
        else {
            if ((i >> 2) % 2) {
                /* Started as Inf+1i: (Inf+1i)*2i -> NaN + Inf*i. */
                ASSERT_TRUE(isnan(Y[i]));
                ASSERT_TRUE(isinf(Y[i + 1]));
            }
            else {
                /* Started as NaN+1i: NaN contaminates both parts. */
                ASSERT_TRUE(isnan(Y[i]));
                ASSERT_TRUE(isnan(Y[i + 1]));
            }
        }
    }
}

// cgemv with alpha == 0 and purely real beta == 2: Y must be scaled by beta
// componentwise, so (Inf+1i)*2 -> Inf + NaN*i (the imag part picks up Inf*0)
// and NaN inputs contaminate both components.
CTEST(cgemv, 2_0_nan_1_inf_1)
{
int i;
blasint N = 17;
blasint incX = 1;
blasint incY = 1;
float alpha[2] = {0.0, 0.0};
float beta[2] = {2.0, 0.0};
char trans = 'N';
float A[17 * 17 * 4];
float X[17 * 2];
float Y[17 * 2];

memset(A, 0, sizeof(A));
memset(X, 0, sizeof(X));
// Even complex elements: NaN+1i; odd complex elements: Inf+1i.
for (i = 0; i < (2 * N - 2); i += 4)
{
Y[i] = NAN;
Y[i + 1] = 1.0;

Y[i + 2] = INFINITY;
Y[i + 3] = 1.0;
}
Y[2 * N - 2] = NAN;
Y[2 * N - 1] = 1.0;
BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
for (i = 0; i < 2 * N; i += 2) {
if ((i >> 1) % 2){
// Odd complex elements started as Inf+1i: real stays Inf, imag becomes NaN.
ASSERT_TRUE(isinf(Y[i]));
ASSERT_TRUE(isnan(Y[i + 1]));
}
else {
// Even complex elements started as NaN+1i: both parts become NaN.
ASSERT_TRUE(isnan(Y[i]));
ASSERT_TRUE(isnan(Y[i + 1]));
}
}
}

// cgemv with alpha == 0, real beta == 2, incY == 2: in-vector elements are
// scaled by beta ((Inf+1i)*2 -> Inf + NaN*i; NaN contaminates both parts),
// while the stride-gap elements must remain exactly zero.
CTEST(cgemv, 2_0_nan_1_inf_1_incy_2)
{
    int i;
    blasint N = 17;
    blasint incX = 1;
    blasint incY = 2;
    float alpha[2] = {0.0, 0.0};
    float beta[2] = {2.0, 0.0};
    char trans = 'N';
    float A[17 * 17 * 4];
    /* N complex elements at incX == 1 need 2*N floats; the original
     * declaration X[17] was undersized (potential out-of-bounds read). */
    float X[17 * 2];
    float Y[17 * 2 * 2];
    float *ay = Y;

    memset(A, 0, sizeof(A));
    memset(X, 0, sizeof(X));
    memset(Y, 0, sizeof(Y));
    /* In-vector elements alternate NaN+1i and Inf+1i. */
    for (i = 0; i < (2 * N - 2); i += 4)
    {
        ay[0] = NAN;
        ay[1] = 1.0;
        ay += 4;
        ay[0] = INFINITY;
        ay[1] = 1.0;
        ay += 4;
    }
    Y[4 * N - 4] = NAN;
    Y[4 * N - 3] = 1.0;
    BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
    for (i = 0; i < 4 * N; i += 2) {
        if ((i >> 1) % 2) {
            /* Stride-gap complex element: must be untouched (still zero). */
            ASSERT_TRUE(Y[i] == 0.0);
            ASSERT_TRUE(Y[i + 1] == 0.0);
        }
        else {
            if ((i >> 2) % 2) {
                /* Started as Inf+1i: real stays Inf, imag becomes NaN. */
                ASSERT_TRUE(isinf(Y[i]));
                ASSERT_TRUE(isnan(Y[i + 1]));
            }
            else {
                /* Started as NaN+1i: both parts become NaN. */
                ASSERT_TRUE(isnan(Y[i]));
                ASSERT_TRUE(isnan(Y[i + 1]));
            }
        }
    }
}

#endif

#ifdef BUILD_COMPLEX16

// zgemv with alpha == beta == 0: Y must be set to zero outright, even where
// it previously held NaN or Inf (double-precision twin of the cgemv test).
CTEST(zgemv, 0_nan_inf)
{
int i;
blasint N = 17;
blasint incX = 1;
blasint incY = 1;
double alpha[2] = {0.0, 0.0};
double beta[2] = {0.0, 0.0};
char trans = 'N';
double A[17 * 17 * 4];
double X[17 * 2];
double Y[17 * 2];

memset(A, 0, sizeof(A));
memset(X, 0, sizeof(X));
// Seed Y with alternating NaN and Inf complex pairs.
for (i = 0; i < (2 * N - 2); i += 4)
{
Y[i] = NAN;
Y[i + 1] = NAN;

Y[i + 2] = INFINITY;
Y[i + 3] = INFINITY;
}
// Last complex element (not covered by the stride-4 loop) gets NaN too.
Y[2 * N - 1] = NAN;
Y[2 * N - 2] = NAN;
BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
// Every entry of Y must have been overwritten with exactly 0.
for (i = 0; i < 2 * N; i ++)
ASSERT_TRUE(Y[i] == 0.0);
}

// zgemv with alpha == beta == 0 and incY == 2: in-vector elements of Y are
// zeroed even if they held NaN/Inf; stride-gap elements stay untouched.
CTEST(zgemv, 0_nan_inf_incy_2)
{
    int i;
    blasint N = 17;
    blasint incX = 1;
    blasint incY = 2;
    double alpha[2] = {0.0, 0.0};
    double beta[2] = {0.0, 0.0};
    char trans = 'N';
    double A[17 * 17 * 4];
    /* N complex elements at incX == 1 need 2*N doubles; the original
     * declaration X[17] was undersized (potential out-of-bounds read). */
    double X[17 * 2];
    double Y[17 * 2 * 2];
    double *ay = Y;

    memset(A, 0, sizeof(A));
    memset(X, 0, sizeof(X));
    memset(Y, 0, sizeof(Y));
    /* Seed the strided (in-vector) elements with alternating NaN and Inf. */
    for (i = 0; i < (2 * N - 2); i += 4)
    {
        ay[0] = NAN;
        ay[1] = NAN;
        ay += 4;
        ay[0] = INFINITY;
        ay[1] = INFINITY;
        ay += 4;
    }
    /* Last in-vector complex element, not covered by the loop above. */
    Y[4 * N - 4] = NAN;
    Y[4 * N - 3] = NAN;
    BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
    /* All of Y must now be zero: vector elements overwritten, gaps untouched. */
    for (i = 0; i < 4 * N; i ++)
        ASSERT_TRUE(Y[i] == 0.0);
}

// zgemv with alpha == 0 and purely imaginary beta == 2i: Y must be scaled by
// beta with componentwise complex multiplication, so NaN/Inf inputs propagate
// (e.g. real part becomes a*0 - b*2, where Inf*0 yields NaN).
CTEST(zgemv, 0_2_nan_1_inf_1)
{
int i;
blasint N = 17;
blasint incX = 1;
blasint incY = 1;
double alpha[2] = {0.0, 0.0};
double beta[2] = {0.0, 2.0};
char trans = 'N';
double A[17 * 17 * 4];
double X[17 * 2];
double Y[17 * 2];

memset(A, 0, sizeof(A));
memset(X, 0, sizeof(X));
// Even complex elements: NaN+1i; odd complex elements: Inf+1i.
for (i = 0; i < (2 * N - 2); i += 4)
{
Y[i] = NAN;
Y[i + 1] = 1.0;

Y[i + 2] = INFINITY;
Y[i + 3] = 1.0;
}
Y[2 * N - 2] = NAN;
Y[2 * N - 1] = 1.0;
BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
for (i = 0; i < 2 * N; i += 2) {
if ((i >> 1) % 2){
// Odd complex elements started as Inf+1i: (Inf+1i)*2i -> NaN + Inf*i.
ASSERT_TRUE(isnan(Y[i]));
ASSERT_TRUE(isinf(Y[i + 1]));
}
else {
// Even complex elements started as NaN+1i: NaN contaminates both parts.
ASSERT_TRUE(isnan(Y[i]));
ASSERT_TRUE(isnan(Y[i + 1]));
}
}
}

// zgemv with alpha == 0, beta == 2i, incY == 2: in-vector elements of Y are
// scaled by beta (propagating NaN/Inf through the complex multiply), while
// the stride-gap elements must remain exactly zero.
CTEST(zgemv, 0_2_nan_1_inf_1_incy_2)
{
    int i;
    blasint N = 17;
    blasint incX = 1;
    blasint incY = 2;
    double alpha[2] = {0.0, 0.0};
    double beta[2] = {0.0, 2.0};
    char trans = 'N';
    double A[17 * 17 * 4];
    /* N complex elements at incX == 1 need 2*N doubles; the original
     * declaration X[17] was undersized (potential out-of-bounds read). */
    double X[17 * 2];
    double Y[17 * 2 * 2];
    double *ay = Y;

    memset(A, 0, sizeof(A));
    memset(X, 0, sizeof(X));
    memset(Y, 0, sizeof(Y));
    /* In-vector elements alternate NaN+1i and Inf+1i. */
    for (i = 0; i < (2 * N - 2); i += 4)
    {
        ay[0] = NAN;
        ay[1] = 1.0;
        ay += 4;
        ay[0] = INFINITY;
        ay[1] = 1.0;
        ay += 4;
    }
    Y[4 * N - 4] = NAN;
    Y[4 * N - 3] = 1.0;
    BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
    for (i = 0; i < 4 * N; i += 2) {
        if ((i >> 1) % 2) {
            /* Stride-gap complex element: must be untouched (still zero). */
            ASSERT_TRUE(Y[i] == 0.0);
            ASSERT_TRUE(Y[i + 1] == 0.0);
        }
        else {
            if ((i >> 2) % 2) {
                /* Started as Inf+1i: (Inf+1i)*2i -> NaN + Inf*i. */
                ASSERT_TRUE(isnan(Y[i]));
                ASSERT_TRUE(isinf(Y[i + 1]));
            }
            else {
                /* Started as NaN+1i: NaN contaminates both parts. */
                ASSERT_TRUE(isnan(Y[i]));
                ASSERT_TRUE(isnan(Y[i + 1]));
            }
        }
    }
}

// zgemv with alpha == 0 and purely real beta == 2: Y must be scaled by beta
// componentwise, so (Inf+1i)*2 -> Inf + NaN*i (the imag part picks up Inf*0)
// and NaN inputs contaminate both components.
CTEST(zgemv, 2_0_nan_1_inf_1)
{
int i;
blasint N = 17;
blasint incX = 1;
blasint incY = 1;
double alpha[2] = {0.0, 0.0};
double beta[2] = {2.0, 0.0};
char trans = 'N';
double A[17 * 17 * 4];
double X[17 * 2];
double Y[17 * 2];

memset(A, 0, sizeof(A));
memset(X, 0, sizeof(X));
// Even complex elements: NaN+1i; odd complex elements: Inf+1i.
for (i = 0; i < (2 * N - 2); i += 4)
{
Y[i] = NAN;
Y[i + 1] = 1.0;

Y[i + 2] = INFINITY;
Y[i + 3] = 1.0;
}
Y[2 * N - 2] = NAN;
Y[2 * N - 1] = 1.0;
BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
for (i = 0; i < 2 * N; i += 2) {
if ((i >> 1) % 2){
// Odd complex elements started as Inf+1i: real stays Inf, imag becomes NaN.
ASSERT_TRUE(isinf(Y[i]));
ASSERT_TRUE(isnan(Y[i + 1]));
}
else {
// Even complex elements started as NaN+1i: both parts become NaN.
ASSERT_TRUE(isnan(Y[i]));
ASSERT_TRUE(isnan(Y[i + 1]));
}
}
}

// zgemv with alpha == 0, real beta == 2, incY == 2: in-vector elements are
// scaled by beta ((Inf+1i)*2 -> Inf + NaN*i; NaN contaminates both parts),
// while the stride-gap elements must remain exactly zero.
CTEST(zgemv, 2_0_nan_1_inf_1_incy_2)
{
    int i;
    blasint N = 17;
    blasint incX = 1;
    blasint incY = 2;
    double alpha[2] = {0.0, 0.0};
    double beta[2] = {2.0, 0.0};
    char trans = 'N';
    double A[17 * 17 * 4];
    /* N complex elements at incX == 1 need 2*N doubles; the original
     * declaration X[17] was undersized (potential out-of-bounds read). */
    double X[17 * 2];
    double Y[17 * 2 * 2];
    double *ay = Y;

    memset(A, 0, sizeof(A));
    memset(X, 0, sizeof(X));
    memset(Y, 0, sizeof(Y));
    /* In-vector elements alternate NaN+1i and Inf+1i. */
    for (i = 0; i < (2 * N - 2); i += 4)
    {
        ay[0] = NAN;
        ay[1] = 1.0;
        ay += 4;
        ay[0] = INFINITY;
        ay[1] = 1.0;
        ay += 4;
    }
    Y[4 * N - 4] = NAN;
    Y[4 * N - 3] = 1.0;
    BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY);
    for (i = 0; i < 4 * N; i += 2) {
        if ((i >> 1) % 2) {
            /* Stride-gap complex element: must be untouched (still zero). */
            ASSERT_TRUE(Y[i] == 0.0);
            ASSERT_TRUE(Y[i + 1] == 0.0);
        }
        else {
            if ((i >> 2) % 2) {
                /* Started as Inf+1i: real stays Inf, imag becomes NaN. */
                ASSERT_TRUE(isinf(Y[i]));
                ASSERT_TRUE(isnan(Y[i + 1]));
            }
            else {
                /* Started as NaN+1i: both parts become NaN. */
                ASSERT_TRUE(isnan(Y[i]));
                ASSERT_TRUE(isnan(Y[i + 1]));
            }
        }
    }
}

#endif

+ 54
- 0
utest/test_zscal.c View File

@@ -442,6 +442,33 @@ CTEST(cscal, i_0inf_inc_2)
ASSERT_TRUE(isnan(inf[17]));
}

// cscal by 0+0i through the cblas entry point: NaN already present in x must
// survive (0 * NaN == NaN), i.e. the "zero the vector" shortcut must not be
// taken when the input holds NaN.
CTEST(cscal, i00_NAN)
{
blasint N=9;
blasint incX=1;
float i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 };
float nan[] = {NAN, 0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
BLASFUNC(cscal)(&N, i, nan, &incX);
// First and last complex elements must have become NaN in both components.
ASSERT_TRUE(isnan(nan[0]));
ASSERT_TRUE(isnan(nan[1]));
ASSERT_TRUE(isnan(nan[16]));
ASSERT_TRUE(isnan(nan[17]));
}

// cscal by 0+0i with incX == 2: NaN in the imaginary parts of the strided
// elements must propagate to both components instead of being zeroed.
CTEST(cscal, i00_NAN_incx_2)
{
blasint N=9;
blasint incX=2;
float i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 };
float nan[] = {0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN,
0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN};
BLASFUNC(cscal)(&N, i, nan, &incX);
// nan[16]/nan[17] is the 5th in-vector complex element (stride 2 * 4 floats).
ASSERT_TRUE(isnan(nan[0]));
ASSERT_TRUE(isnan(nan[1]));
ASSERT_TRUE(isnan(nan[16]));
ASSERT_TRUE(isnan(nan[17]));
}

#endif

#ifdef BUILD_COMPLEX16
@@ -588,4 +615,31 @@ CTEST(zscal, i_0inf_inc_2)
ASSERT_TRUE(isnan(inf[17]));
}

// zscal by 0+0i through the cblas entry point: NaN already present in x must
// survive (0 * NaN == NaN) — double-precision twin of the cscal test.
CTEST(zscal, i00_NAN)
{
blasint N=9;
blasint incX=1;
double i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 };
double nan[] = {NAN, 0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
BLASFUNC(zscal)(&N, i, nan, &incX);
// First and last complex elements must have become NaN in both components.
ASSERT_TRUE(isnan(nan[0]));
ASSERT_TRUE(isnan(nan[1]));
ASSERT_TRUE(isnan(nan[16]));
ASSERT_TRUE(isnan(nan[17]));
}

// zscal by 0+0i with incX == 2: NaN in the imaginary parts of the strided
// elements must propagate to both components instead of being zeroed.
CTEST(zscal, i00_NAN_incx_2)
{
blasint N=9;
blasint incX=2;
double i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 };
double nan[] = {0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN,
0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN};
BLASFUNC(zscal)(&N, i, nan, &incX);
// nan[16]/nan[17] is the 5th in-vector complex element (stride 2 * 4 doubles).
ASSERT_TRUE(isnan(nan[0]));
ASSERT_TRUE(isnan(nan[1]));
ASSERT_TRUE(isnan(nan[16]));
ASSERT_TRUE(isnan(nan[17]));
}

#endif

Loading…
Cancel
Save