| @@ -83,9 +83,39 @@ jobs: | |||
| - name: test | |||
| run: | | |||
| export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH | |||
| qemu-riscv64 ./utest/openblas_utest | |||
| qemu-riscv64 ./utest/openblas_utest_ext | |||
# Run a command under timeout(1), retrying only when an attempt timed out.
#
# Arguments:
#   $1  Command line as ONE string. It is word-split on whitespace before
#       execution, so quoting inside the string is not honoured.
#
# Outcome:
#   - command succeeds        -> prints a message and returns 0
#   - attempt times out       -> retried (10 attempts in total); the time
#                                limit starts at 10 s and grows by 5 s
#                                after every timed-out attempt
#   - any other failure       -> exits the calling shell with that status
#   - every attempt timed out -> exits the calling shell with the
#                                timed-out status
run_with_retry() {
    local cmd="$1"
    local time_out=10
    local retries=10
    # Signal sent by timeout(1) when the limit expires. With
    # --preserve-status the reported status is then 128 + signal number,
    # so derive it instead of hard-coding 140 separately.
    local term_signal=12
    local timed_out_status=$((128 + term_signal))
    local exit_code=0
    local i

    for ((i = 1; i <= retries; i++)); do
        exit_code=0
        # $cmd is intentionally unquoted: the string must split into argv.
        timeout -s "$term_signal" --preserve-status "$time_out" $cmd || exit_code=$?

        if [ "$exit_code" -eq 0 ]; then
            echo "Command succeeded on attempt $i."
            return 0
        fi

        if [ "$exit_code" -ne "$timed_out_status" ]; then
            # A genuine failure: retrying would only hide it.
            echo "Attempt $i failed with exit code $exit_code. Aborting workflow."
            exit "$exit_code"
        fi

        echo "Attempt $i timed out (retrying...)"
        time_out=$((time_out + 5))
    done

    echo "All $retries attempts failed, giving up."
    echo "Final failure was due to timeout."
    echo "Aborting workflow."
    exit "$exit_code"
}
| export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH | |||
| which qemu-riscv64 | |||
| export QEMU_BIN=$(which qemu-riscv64) | |||
| run_with_retry "$QEMU_BIN ./utest/openblas_utest" | |||
| run_with_retry "$QEMU_BIN ./utest/openblas_utest_ext" | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1 | |||
| @@ -62,6 +62,7 @@ option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm th | |||
| option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) | |||
| option(BUILD_STATIC_LIBS "Build static library" OFF) | |||
| option(BUILD_SHARED_LIBS "Build shared library" OFF) | |||
| if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) | |||
| set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) | |||
| endif() | |||
| @@ -123,7 +124,12 @@ message(WARNING "CMake support is experimental. It does not yet support all buil | |||
| include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") | |||
| include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") | |||
| set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) | |||
| string(FIND "${LIBNAMESUFFIX}" "${SUFFIX64_UNDERSCORE}" HAVE64) | |||
| if (${HAVE64} GREATER -1) | |||
| set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}) | |||
| else () | |||
| set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) | |||
| endif () | |||
| set(BLASDIRS interface driver/level2 driver/level3 driver/others) | |||
| @@ -716,4 +722,5 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake | |||
| DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| install(EXPORT "${PN}${SUFFIX64}Targets" | |||
| NAMESPACE "${PN}${SUFFIX64}::" | |||
| DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| @@ -276,6 +276,7 @@ SMALL_MATRIX_OPT = 1 | |||
| endif | |||
| ifeq ($(ARCH), arm64) | |||
| GEMM_GEMV_FORWARD = 1 | |||
| GEMM_GEMV_FORWARD_BF16 = 1 | |||
| endif | |||
| ifeq ($(ARCH), riscv) | |||
| GEMM_GEMV_FORWARD = 1 | |||
| @@ -229,9 +229,9 @@ if (${CORE} STREQUAL NEOVERSEN1) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-n1") | |||
| elseif (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1") | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a -mtune=neoverse-n1") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a") | |||
| endif() | |||
| endif () | |||
| endif () | |||
| @@ -260,13 +260,13 @@ endif () | |||
| if (${CORE} STREQUAL CORTEXA510) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL CORTEXA710) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") | |||
| endif () | |||
| endif () | |||
| @@ -278,7 +278,7 @@ endif () | |||
| if (${CORE} STREQUAL CORTEXX2) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") | |||
| endif () | |||
| endif () | |||
| @@ -6,7 +6,7 @@ enable_language(Fortran) | |||
| endif() | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") | |||
| if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_EQUAL 14.2) | |||
# For 32-bit MinGW builds with a Fortran compiler newer than 14.1, drop the
# optimization level from the Fortran flags and clear the Release flags.
# NOTE(review): presumably a workaround for an optimizer problem in those
# compiler versions - confirm against the originating issue.
if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER 14.1)
  if (NOT "${CMAKE_Fortran_FLAGS}" STREQUAL "")
    # list(REMOVE_ITEM) expects the NAME of a list variable, and
    # CMAKE_Fortran_FLAGS is a space-separated string rather than a list.
    # Split it into a real list, filter it, then join it back with spaces.
    separate_arguments(fortran_flag_list UNIX_COMMAND "${CMAKE_Fortran_FLAGS}")
    list(REMOVE_ITEM fortran_flag_list -O3 -O2 -O1 -Os)
    string(REPLACE ";" " " CMAKE_Fortran_FLAGS "${fortran_flag_list}")
    unset(fortran_flag_list)
  endif()
  set (CMAKE_Fortran_FLAGS_RELEASE "" CACHE STRING "" FORCE)
endif()
| @@ -851,9 +851,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF | |||
| /* The objective function comes from the sum of the partitions in m and n. */ | |||
| /* (n / nthreads_n) + (m / nthreads_m) */ | |||
| /* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m) */ | |||
| while (nthreads_m % 2 == 0 && n * nthreads_m + m * nthreads_n > n * (nthreads_m / 2) + m * (nthreads_n * 2)) { | |||
| nthreads_m /= 2; | |||
| nthreads_n *= 2; | |||
| BLASLONG cost = 0, div = 0; | |||
| BLASLONG i; | |||
| for (i = 1; i <= sqrt(nthreads_m); i++) { | |||
| if (nthreads_m % i) continue; | |||
| BLASLONG j = nthreads_m / i; | |||
| BLASLONG cost_i = n * j + m * nthreads_n * i; | |||
| BLASLONG cost_j = n * i + m * nthreads_n * j; | |||
| if (cost == 0 || | |||
| cost_i < cost) {cost = cost_i; div = i;} | |||
| if (cost_j < cost) {cost = cost_j; div = j;} | |||
| } | |||
| if (div > 1) { | |||
| nthreads_m /= div; | |||
| nthreads_n *= div; | |||
| } | |||
| } | |||
| @@ -417,21 +417,24 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| PRINT_DEBUG_CNAME; | |||
| #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) | |||
| #if defined(DYNAMIC_ARCH) && defined(ARCH_x86) | |||
| if (support_avx512() ) | |||
| #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||
| #if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) | |||
| #if defined(DYNAMIC_ARCH) | |||
| if (support_avx512() ) | |||
| #endif | |||
| if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { | |||
| SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | |||
| return; | |||
| } | |||
| #endif | |||
| #if defined(DYNAMIC_ARCH) && defined(ARCH_ARM64) | |||
| if (support_sme1()){ | |||
| #if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) | |||
| #if defined(DYNAMIC_ARCH) | |||
| if (support_sme1()) | |||
| #endif | |||
| if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) { | |||
| SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | |||
| return; | |||
| } | |||
| } | |||
| #endif | |||
| #endif | |||
| @@ -98,7 +98,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ | |||
| if (nthreads == 1) { | |||
| #endif | |||
| SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 0); | |||
| SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 1); | |||
| #ifdef SMP | |||
| } else { | |||
| @@ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ | |||
| mode = BLAS_SINGLE | BLAS_COMPLEX; | |||
| #endif | |||
| blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); | |||
| blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 1, (int (*)(void))SCAL_K, nthreads); | |||
| } | |||
| #endif | |||
| @@ -116,12 +116,12 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, | |||
| #else | |||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { | |||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, void* valpha, FLOAT *x, int incx, FLOAT *a, int lda) { | |||
| FLOAT *buffer; | |||
| int uplo; | |||
| blasint info; | |||
| FLOAT * ALPHA = α | |||
| FLOAT * ALPHA = (FLOAT*)valpha; | |||
| FLOAT alpha_r = ALPHA[0]; | |||
| FLOAT alpha_i = ALPHA[1]; | |||
| #ifdef SMP | |||
| @@ -27,65 +27,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /************************************************************************************** | |||
| * 2013/09/14 Saar | |||
| * BLASTEST float : OK | |||
| * BLASTEST double : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * BLASTEST float : OK | |||
| * BLASTEST double : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| // The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces. | |||
| // In certain cases, the results expected from cblas_c/zscal differ from those expected by the other upper-level interfaces. | |||
| // To handle this, the dummy2 parameter is used to tell the two kinds of caller apart. | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG inc_x2; | |||
| BLASLONG ip = 0; | |||
| FLOAT temp; | |||
| BLASLONG i = 0; | |||
| BLASLONG inc_x2; | |||
| BLASLONG ip = 0; | |||
| FLOAT temp; | |||
| if ( (n <= 0) || (inc_x <= 0)) | |||
| return(0); | |||
| if ((n <= 0) || (inc_x <= 0)) | |||
| return(0); | |||
| inc_x2 = 2 * inc_x; | |||
| if (dummy2 == 0) { | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| if (da_r == 0.0 && da_i == 0.0) | |||
| { | |||
| x[ip] = 0.0; | |||
| x[ip+1] = 0.0; | |||
| } | |||
| else | |||
| { | |||
| temp = da_r * x[ip] - da_i * x[ip+1]; | |||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||
| x[ip] = temp; | |||
| } | |||
| inc_x2 = 2 * inc_x; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| if ( da_r == 0.0 ) | |||
| { | |||
| if ( da_i == 0.0 ) | |||
| { | |||
| temp = 0.0; | |||
| x[ip+1] = 0.0 ; | |||
| } | |||
| else | |||
| { | |||
| temp = - da_i * x[ip+1] ; | |||
| if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; | |||
| if (!isinf(x[ip+1])) | |||
| x[ip+1] = da_i * x[ip] ; | |||
| else x[ip+1] = NAN; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( da_i == 0.0 ) | |||
| { | |||
| temp = da_r * x[ip] ; | |||
| x[ip+1] = da_r * x[ip+1]; | |||
| } | |||
| else | |||
| { | |||
| temp = da_r * x[ip] - da_i * x[ip+1] ; | |||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||
| } | |||
| } | |||
| x[ip] = temp; | |||
| ip += inc_x2; | |||
| } | |||
| return(0); | |||
| } | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| temp = da_r * x[ip] - da_i * x[ip+1]; | |||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||
| ip += inc_x2; | |||
| } | |||
| return(0); | |||
| x[ip] = temp; | |||
| ip += inc_x2; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -134,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| " fadd v4.4s, v4.4s, v6.4s \n" \ | |||
| " fadd v0.4s, v0.4s, v4.4s \n" \ | |||
| " faddp v0.4s, v0.4s, v0.4s \n" \ | |||
| " faddp v0.4s, v0.4s, v0.4s \n" | |||
| " faddp "OUT", v0.2s \n" | |||
| #else /* !defined(DSDOT) */ | |||
| #define KERNEL_F1 \ | |||
| @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define INC_X x4 /* X stride */ | |||
| #define I x5 /* loop variable */ | |||
| #define X_COPY x6 /* Copy of X */ | |||
| #define FLAG x7 | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| @@ -216,6 +216,9 @@ zscal_begin: | |||
| cmp N, xzr | |||
| ble .Lzscal_kernel_L999 | |||
| ldr FLAG, [sp] | |||
| cmp FLAG, #1 | |||
| beq .Lzscal_kernel_RI_non_zero | |||
| fcmp DA_R, #0.0 | |||
| bne .Lzscal_kernel_R_non_zero | |||
| @@ -228,7 +231,7 @@ zscal_begin: | |||
| .Lzscal_kernel_R_non_zero: | |||
| fcmp DA_I, #0.0 | |||
| beq .Lzscal_kernel_I_zero | |||
| //QUAK beq .Lzscal_kernel_I_zero | |||
| /******************************************************************************* | |||
| * A_R != 0 && A_I != 0 | |||
| @@ -0,0 +1,332 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| BLASLONG i, j; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; | |||
| IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; | |||
| IFLOAT *boffset; | |||
| IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| aoffset = a; | |||
| boffset = b; | |||
| lda *= 2; | |||
| j = (n >> 4); | |||
| if (j > 0){ | |||
| do{ | |||
| aoffset1 = aoffset; | |||
| aoffset2 = aoffset1 + lda; | |||
| aoffset3 = aoffset2 + lda; | |||
| aoffset4 = aoffset3 + lda; | |||
| aoffset5 = aoffset4 + lda; | |||
| aoffset6 = aoffset5 + lda; | |||
| aoffset7 = aoffset6 + lda; | |||
| aoffset8 = aoffset7 + lda; | |||
| aoffset9 = aoffset8 + lda; | |||
| aoffset10 = aoffset9 + lda; | |||
| aoffset11 = aoffset10 + lda; | |||
| aoffset12 = aoffset11 + lda; | |||
| aoffset13 = aoffset12 + lda; | |||
| aoffset14 = aoffset13 + lda; | |||
| aoffset15 = aoffset14 + lda; | |||
| aoffset16 = aoffset15 + lda; | |||
| aoffset += 16 * lda; | |||
| i = m; | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| ctemp03 = *(aoffset2 + 0); | |||
| ctemp04 = *(aoffset2 + 1); | |||
| ctemp05 = *(aoffset3 + 0); | |||
| ctemp06 = *(aoffset3 + 1); | |||
| ctemp07 = *(aoffset4 + 0); | |||
| ctemp08 = *(aoffset4 + 1); | |||
| ctemp09 = *(aoffset5 + 0); | |||
| ctemp10 = *(aoffset5 + 1); | |||
| ctemp11 = *(aoffset6 + 0); | |||
| ctemp12 = *(aoffset6 + 1); | |||
| ctemp13 = *(aoffset7 + 0); | |||
| ctemp14 = *(aoffset7 + 1); | |||
| ctemp15 = *(aoffset8 + 0); | |||
| ctemp16 = *(aoffset8 + 1); | |||
| ctemp17 = *(aoffset9 + 0); | |||
| ctemp18 = *(aoffset9 + 1); | |||
| ctemp19 = *(aoffset10 + 0); | |||
| ctemp20 = *(aoffset10 + 1); | |||
| ctemp21 = *(aoffset11 + 0); | |||
| ctemp22 = *(aoffset11 + 1); | |||
| ctemp23 = *(aoffset12 + 0); | |||
| ctemp24 = *(aoffset12 + 1); | |||
| ctemp25 = *(aoffset13 + 0); | |||
| ctemp26 = *(aoffset13 + 1); | |||
| ctemp27 = *(aoffset14 + 0); | |||
| ctemp28 = *(aoffset14 + 1); | |||
| ctemp29 = *(aoffset15 + 0); | |||
| ctemp30 = *(aoffset15 + 1); | |||
| ctemp31 = *(aoffset16 + 0); | |||
| ctemp32 = *(aoffset16 + 1); | |||
| *(boffset + 0) = ctemp01; | |||
| *(boffset + 1) = ctemp02; | |||
| *(boffset + 2) = ctemp03; | |||
| *(boffset + 3) = ctemp04; | |||
| *(boffset + 4) = ctemp05; | |||
| *(boffset + 5) = ctemp06; | |||
| *(boffset + 6) = ctemp07; | |||
| *(boffset + 7) = ctemp08; | |||
| *(boffset + 8) = ctemp09; | |||
| *(boffset + 9) = ctemp10; | |||
| *(boffset + 10) = ctemp11; | |||
| *(boffset + 11) = ctemp12; | |||
| *(boffset + 12) = ctemp13; | |||
| *(boffset + 13) = ctemp14; | |||
| *(boffset + 14) = ctemp15; | |||
| *(boffset + 15) = ctemp16; | |||
| *(boffset + 16) = ctemp17; | |||
| *(boffset + 17) = ctemp18; | |||
| *(boffset + 18) = ctemp19; | |||
| *(boffset + 19) = ctemp20; | |||
| *(boffset + 20) = ctemp21; | |||
| *(boffset + 21) = ctemp22; | |||
| *(boffset + 22) = ctemp23; | |||
| *(boffset + 23) = ctemp24; | |||
| *(boffset + 24) = ctemp25; | |||
| *(boffset + 25) = ctemp26; | |||
| *(boffset + 26) = ctemp27; | |||
| *(boffset + 27) = ctemp28; | |||
| *(boffset + 28) = ctemp29; | |||
| *(boffset + 29) = ctemp30; | |||
| *(boffset + 30) = ctemp31; | |||
| *(boffset + 31) = ctemp32; | |||
| aoffset1 += 2; | |||
| aoffset2 += 2; | |||
| aoffset3 += 2; | |||
| aoffset4 += 2; | |||
| aoffset5 += 2; | |||
| aoffset6 += 2; | |||
| aoffset7 += 2; | |||
| aoffset8 += 2; | |||
| aoffset9 += 2; | |||
| aoffset10 += 2; | |||
| aoffset11 += 2; | |||
| aoffset12 += 2; | |||
| aoffset13 += 2; | |||
| aoffset14 += 2; | |||
| aoffset15 += 2; | |||
| aoffset16 += 2; | |||
| boffset += 32; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| j--; | |||
| }while(j > 0); | |||
| } /* end of if(j > 0) */ | |||
| if (n & 8){ | |||
| aoffset1 = aoffset; | |||
| aoffset2 = aoffset1 + lda; | |||
| aoffset3 = aoffset2 + lda; | |||
| aoffset4 = aoffset3 + lda; | |||
| aoffset5 = aoffset4 + lda; | |||
| aoffset6 = aoffset5 + lda; | |||
| aoffset7 = aoffset6 + lda; | |||
| aoffset8 = aoffset7 + lda; | |||
| aoffset += 8 * lda; | |||
| i = m; | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| ctemp03 = *(aoffset2 + 0); | |||
| ctemp04 = *(aoffset2 + 1); | |||
| ctemp05 = *(aoffset3 + 0); | |||
| ctemp06 = *(aoffset3 + 1); | |||
| ctemp07 = *(aoffset4 + 0); | |||
| ctemp08 = *(aoffset4 + 1); | |||
| ctemp09 = *(aoffset5 + 0); | |||
| ctemp10 = *(aoffset5 + 1); | |||
| ctemp11 = *(aoffset6 + 0); | |||
| ctemp12 = *(aoffset6 + 1); | |||
| ctemp13 = *(aoffset7 + 0); | |||
| ctemp14 = *(aoffset7 + 1); | |||
| ctemp15 = *(aoffset8 + 0); | |||
| ctemp16 = *(aoffset8 + 1); | |||
| *(boffset + 0) = ctemp01; | |||
| *(boffset + 1) = ctemp02; | |||
| *(boffset + 2) = ctemp03; | |||
| *(boffset + 3) = ctemp04; | |||
| *(boffset + 4) = ctemp05; | |||
| *(boffset + 5) = ctemp06; | |||
| *(boffset + 6) = ctemp07; | |||
| *(boffset + 7) = ctemp08; | |||
| *(boffset + 8) = ctemp09; | |||
| *(boffset + 9) = ctemp10; | |||
| *(boffset + 10) = ctemp11; | |||
| *(boffset + 11) = ctemp12; | |||
| *(boffset + 12) = ctemp13; | |||
| *(boffset + 13) = ctemp14; | |||
| *(boffset + 14) = ctemp15; | |||
| *(boffset + 15) = ctemp16; | |||
| aoffset1 += 2; | |||
| aoffset2 += 2; | |||
| aoffset3 += 2; | |||
| aoffset4 += 2; | |||
| aoffset5 += 2; | |||
| aoffset6 += 2; | |||
| aoffset7 += 2; | |||
| aoffset8 += 2; | |||
| boffset += 16; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| } | |||
| if (n & 4){ | |||
| aoffset1 = aoffset; | |||
| aoffset2 = aoffset1 + lda; | |||
| aoffset3 = aoffset2 + lda; | |||
| aoffset4 = aoffset3 + lda; | |||
| aoffset += 4 * lda; | |||
| i = m; | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| ctemp03 = *(aoffset2 + 0); | |||
| ctemp04 = *(aoffset2 + 1); | |||
| ctemp05 = *(aoffset3 + 0); | |||
| ctemp06 = *(aoffset3 + 1); | |||
| ctemp07 = *(aoffset4 + 0); | |||
| ctemp08 = *(aoffset4 + 1); | |||
| *(boffset + 0) = ctemp01; | |||
| *(boffset + 1) = ctemp02; | |||
| *(boffset + 2) = ctemp03; | |||
| *(boffset + 3) = ctemp04; | |||
| *(boffset + 4) = ctemp05; | |||
| *(boffset + 5) = ctemp06; | |||
| *(boffset + 6) = ctemp07; | |||
| *(boffset + 7) = ctemp08; | |||
| aoffset1 += 2; | |||
| aoffset2 += 2; | |||
| aoffset3 += 2; | |||
| aoffset4 += 2; | |||
| boffset += 8; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| } /* end of if(j > 0) */ | |||
| if (n & 2){ | |||
| aoffset1 = aoffset; | |||
| aoffset2 = aoffset1 + lda; | |||
| aoffset += 2 * lda; | |||
| i = m; | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| ctemp03 = *(aoffset2 + 0); | |||
| ctemp04 = *(aoffset2 + 1); | |||
| *(boffset + 0) = ctemp01; | |||
| *(boffset + 1) = ctemp02; | |||
| *(boffset + 2) = ctemp03; | |||
| *(boffset + 3) = ctemp04; | |||
| aoffset1 += 2; | |||
| aoffset2 += 2; | |||
| boffset += 4; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| } /* end of if(j > 0) */ | |||
| if (n & 1){ | |||
| aoffset1 = aoffset; | |||
| i = m; | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp01 = *(aoffset1 + 0); | |||
| ctemp02 = *(aoffset1 + 1); | |||
| *(boffset + 0) = ctemp01; | |||
| *(boffset + 1) = ctemp02; | |||
| aoffset1 += 2; | |||
| boffset += 2; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| } /* end of if(j > 0) */ | |||
| return 0; | |||
| } | |||
| @@ -45,18 +45,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define S9 $r20 | |||
| #define S10 $r23 | |||
| #define S11 $r24 | |||
| #define S12 $r25 | |||
| #define S13 $r26 | |||
| #define S14 $r27 | |||
| #define S15 $r28 | |||
| #define S16 $r29 | |||
| #define TD $r30 | |||
| #define TS $r31 | |||
| #define S9 $r23 | |||
| #define S10 $r24 | |||
| #define S11 $r25 | |||
| #define S12 $r26 | |||
| #define S13 $r27 | |||
| #define S14 $r28 | |||
| #define S15 $r29 | |||
| #define S16 $r30 | |||
| #define TD $r20 | |||
| #define TS $r11 | |||
| #define TL $r7 | |||
| #define T0 $r6 | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| @@ -67,6 +66,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| #define F8 $f8 | |||
| #define F9 $f9 | |||
| #define F10 $f10 | |||
| #define F11 $f11 | |||
| #define F12 $f12 | |||
| #define F13 $f13 | |||
| #define F14 $f14 | |||
| #define F15 $f15 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| @@ -103,589 +110,232 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -0x90 | |||
| SDARG $r23, $sp, 0x00 | |||
| SDARG $r24, $sp, 0x08 | |||
| SDARG $r25, $sp, 0x10 | |||
| SDARG $r26, $sp, 0x18 | |||
| SDARG $r27, $sp, 0x20 | |||
| SDARG $r28, $sp, 0x28 | |||
| SDARG $r29, $sp, 0x30 | |||
| SDARG $r30, $sp, 0x38 | |||
| SDARG $r31, $sp, 0x40 | |||
| ST $f23, $sp, 0x48 | |||
| ST $f24, $sp, 0x50 | |||
| ST $f25, $sp, 0x58 | |||
| ST $f26, $sp, 0x60 | |||
| ST $f27, $sp, 0x68 | |||
| ST $f28, $sp, 0x70 | |||
| ST $f29, $sp, 0x78 | |||
| ST $f30, $sp, 0x80 | |||
| ST $f31, $sp, 0x88 | |||
| move TD, DST | |||
| move TS, SRC | |||
| slli.d TL, LDA, 0x03 | |||
| slli.d T0, TL, 0x01 | |||
| srai.d J, N, 0x04 | |||
| addi.d $sp, $sp, -64 | |||
| SDARG $r23, $sp, 0 | |||
| SDARG $r24, $sp, 8 | |||
| SDARG $r25, $sp, 16 | |||
| SDARG $r26, $sp, 24 | |||
| SDARG $r27, $sp, 32 | |||
| SDARG $r28, $sp, 40 | |||
| SDARG $r29, $sp, 48 | |||
| SDARG $r30, $sp, 56 | |||
| move TD, DST //boffset | |||
| move TS, SRC //aoffset | |||
| slli.d TL, LDA, 0x03 //lda | |||
| srai.d J, N, 0x04 //j | |||
| beq J, ZERO, .L_N8 | |||
| .L_J1: /* J-- */ | |||
| .L_J1: /* if(j>0) j--*/ | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x03 | |||
| move I, M | |||
| add.d S3, S2, TL | |||
| addi.d J, J, -1 | |||
| add.d S4, S3, TL | |||
| add.d S5, S3, T0 | |||
| add.d S6, S4, T0 | |||
| add.d S7, S5, T0 | |||
| add.d S8, S6, T0 | |||
| add.d S9, S7, T0 | |||
| add.d S10, S8, T0 | |||
| add.d S11, S9, T0 | |||
| add.d S12, S10, T0 | |||
| add.d S13, S11, T0 | |||
| add.d S14, S12, T0 | |||
| add.d S15, S13, T0 | |||
| add.d S16, S14, T0 | |||
| add.d TS, S15, T0 | |||
| beq I, ZERO, .L_I7 | |||
| .L_I1: /* I-- */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| xvld U8, S9, 0x00 | |||
| xvld U9, S10, 0x00 | |||
| xvld U10, S11, 0x00 | |||
| xvld U11, S12, 0x00 | |||
| xvld U12, S13, 0x00 | |||
| xvld U13, S14, 0x00 | |||
| xvld U14, S15, 0x00 | |||
| xvld U15, S16, 0x00 | |||
| xvpackev.d D0, U1, U0 | |||
| xvpackod.d D1, U1, U0 | |||
| xvpackev.d D2, U3, U2 | |||
| xvpackod.d D3, U3, U2 | |||
| xvpackev.d D4, U5, U4 | |||
| xvpackod.d D5, U5, U4 | |||
| xvpackev.d D6, U7, U6 | |||
| xvpackod.d D7, U7, U6 | |||
| xvpackev.d D8, U9, U8 | |||
| xvpackod.d D9, U9, U8 | |||
| xvpackev.d D10, U11, U10 | |||
| xvpackod.d D11, U11, U10 | |||
| xvpackev.d D12, U13, U12 | |||
| xvpackod.d D13, U13, U12 | |||
| xvpackev.d D14, U15, U14 | |||
| xvpackod.d D15, U15, U14 | |||
| xvand.v U0, D0, D0 | |||
| xvpermi.q D0, D2, 0x02 // 0 | |||
| xvand.v U4, D4, D4 | |||
| xvpermi.q D4, D6, 0x02 // 1 | |||
| xvand.v U1, D1, D1 | |||
| xvpermi.q D1, D3, 0x02 // 4 | |||
| xvand.v U5, D5, D5 | |||
| xvpermi.q D5, D7, 0x02 // 5 | |||
| xvpermi.q D2, U0, 0x31 // 8 | |||
| xvpermi.q D6, U4, 0x31 // 9 | |||
| xvpermi.q D3, U1, 0x31 // 12 | |||
| xvpermi.q D7, U5, 0x31 // 13 | |||
| xvand.v U8, D8, D8 | |||
| xvpermi.q D8, D10, 0x02 // 2 | |||
| xvand.v U12, D12, D12 | |||
| xvpermi.q D12, D14, 0x02 // 3 | |||
| xvand.v U9, D9, D9 | |||
| xvpermi.q D9, D11, 0x02 // 6 | |||
| xvand.v U13, D13, D13 | |||
| xvpermi.q D13, D15, 0x02 // 7 | |||
| xvpermi.q D10, U8, 0x31 // 10 | |||
| xvpermi.q D14, U12, 0x31 // 11 | |||
| xvpermi.q D11, U9, 0x31 // 14 | |||
| xvpermi.q D15, U13, 0x31 // 15 | |||
| xvst D0, TD, 0x00 // 0 | |||
| xvst D4, TD, 0x20 // 1 | |||
| xvst D8, TD, 0x40 // 2 | |||
| xvst D12, TD, 0x60 // 3 | |||
| xvst D1, TD, 0x80 // 4 | |||
| xvst D5, TD, 0xA0 // 5 | |||
| xvst D9, TD, 0xC0 // 6 | |||
| xvst D13, TD, 0xE0 // 7 | |||
| addi.d TD, TD, 0x100 | |||
| xvst D2, TD, 0x00 // 8 | |||
| xvst D6, TD, 0x20 // 9 | |||
| xvst D10, TD, 0x40 // 10 | |||
| xvst D14, TD, 0x60 // 11 | |||
| xvst D3, TD, 0x80 // 12 | |||
| xvst D7, TD, 0xA0 // 13 | |||
| xvst D11, TD, 0xC0 // 14 | |||
| xvst D15, TD, 0xE0 // 15 | |||
| addi.d TD, TD, 0x100 | |||
| xvld U0, S1, 0x20 | |||
| xvld U1, S2, 0x20 | |||
| xvld U2, S3, 0x20 | |||
| xvld U3, S4, 0x20 | |||
| xvld U4, S5, 0x20 | |||
| xvld U5, S6, 0x20 | |||
| xvld U6, S7, 0x20 | |||
| xvld U7, S8, 0x20 | |||
| xvld U8, S9, 0x20 | |||
| xvld U9, S10, 0x20 | |||
| xvld U10, S11, 0x20 | |||
| xvld U11, S12, 0x20 | |||
| xvld U12, S13, 0x20 | |||
| xvld U13, S14, 0x20 | |||
| xvld U14, S15, 0x20 | |||
| xvld U15, S16, 0x20 | |||
| xvpackev.d D0, U1, U0 | |||
| xvpackod.d D1, U1, U0 | |||
| xvpackev.d D2, U3, U2 | |||
| xvpackod.d D3, U3, U2 | |||
| xvpackev.d D4, U5, U4 | |||
| xvpackod.d D5, U5, U4 | |||
| xvpackev.d D6, U7, U6 | |||
| xvpackod.d D7, U7, U6 | |||
| xvpackev.d D8, U9, U8 | |||
| xvpackod.d D9, U9, U8 | |||
| xvpackev.d D10, U11, U10 | |||
| xvpackod.d D11, U11, U10 | |||
| xvpackev.d D12, U13, U12 | |||
| xvpackod.d D13, U13, U12 | |||
| xvpackev.d D14, U15, U14 | |||
| xvpackod.d D15, U15, U14 | |||
| xvand.v U0, D0, D0 | |||
| xvpermi.q D0, D2, 0x02 // 0 | |||
| xvand.v U4, D4, D4 | |||
| xvpermi.q D4, D6, 0x02 // 1 | |||
| xvand.v U1, D1, D1 | |||
| xvpermi.q D1, D3, 0x02 // 4 | |||
| xvand.v U5, D5, D5 | |||
| xvpermi.q D5, D7, 0x02 // 5 | |||
| xvpermi.q D2, U0, 0x31 // 8 | |||
| xvpermi.q D6, U4, 0x31 // 9 | |||
| xvpermi.q D3, U1, 0x31 // 12 | |||
| xvpermi.q D7, U5, 0x31 // 13 | |||
| xvand.v U8, D8, D8 | |||
| xvpermi.q D8, D10, 0x02 // 2 | |||
| xvand.v U12, D12, D12 | |||
| xvpermi.q D12, D14, 0x02 // 3 | |||
| xvand.v U9, D9, D9 | |||
| xvpermi.q D9, D11, 0x02 // 6 | |||
| xvand.v U13, D13, D13 | |||
| xvpermi.q D13, D15, 0x02 // 7 | |||
| xvpermi.q D10, U8, 0x31 // 10 | |||
| xvpermi.q D14, U12, 0x31 // 11 | |||
| xvpermi.q D11, U9, 0x31 // 14 | |||
| xvpermi.q D15, U13, 0x31 // 15 | |||
| xvst D0, TD, 0x00 // 0 | |||
| xvst D4, TD, 0x20 // 1 | |||
| xvst D8, TD, 0x40 // 2 | |||
| xvst D12, TD, 0x60 // 3 | |||
| xvst D1, TD, 0x80 // 4 | |||
| xvst D5, TD, 0xA0 // 5 | |||
| xvst D9, TD, 0xC0 // 6 | |||
| xvst D13, TD, 0xE0 // 7 | |||
| addi.d TD, TD, 0x100 | |||
| xvst D2, TD, 0x00 // 8 | |||
| xvst D6, TD, 0x20 // 9 | |||
| xvst D10, TD, 0x40 // 10 | |||
| xvst D14, TD, 0x60 // 11 | |||
| xvst D3, TD, 0x80 // 12 | |||
| xvst D7, TD, 0xA0 // 13 | |||
| xvst D11, TD, 0xC0 // 14 | |||
| xvst D15, TD, 0xE0 // 15 | |||
| addi.d TD, TD, 0x100 | |||
| addi.d S1, S1, 0x40 | |||
| addi.d S2, S2, 0x40 | |||
| addi.d S3, S3, 0x40 | |||
| addi.d S4, S4, 0x40 | |||
| addi.d S5, S5, 0x40 | |||
| addi.d S6, S6, 0x40 | |||
| addi.d S7, S7, 0x40 | |||
| addi.d S8, S8, 0x40 | |||
| addi.d S9, S9, 0x40 | |||
| addi.d S10, S10, 0x40 | |||
| addi.d S11, S11, 0x40 | |||
| addi.d S12, S12, 0x40 | |||
| addi.d S13, S13, 0x40 | |||
| addi.d S14, S14, 0x40 | |||
| addi.d S15, S15, 0x40 | |||
| addi.d S16, S16, 0x40 | |||
| add.d S5, S4, TL | |||
| add.d S6, S5, TL | |||
| add.d S7, S6, TL | |||
| add.d S8, S7, TL | |||
| add.d S9, S8, TL | |||
| add.d S10, S9, TL | |||
| add.d S11, S10, TL | |||
| add.d S12, S11, TL | |||
| add.d S13, S12, TL | |||
| add.d S14, S13, TL | |||
| add.d S15, S14, TL | |||
| add.d S16, S15, TL | |||
| add.d TS, S16, TL | |||
| beq I, ZERO, .L_J11 | |||
| .L_I1: /* if(i>0) i--*/ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fld.d F4, S5, 0x00 | |||
| fld.d F5, S6, 0x00 | |||
| fld.d F6, S7, 0x00 | |||
| fld.d F7, S8, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| fst.d F1, TD, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| fst.d F3, TD, 0x18 | |||
| fst.d F4, TD, 0x20 | |||
| fst.d F5, TD, 0x28 | |||
| fst.d F6, TD, 0x30 | |||
| fst.d F7, TD, 0x38 | |||
| fld.d F0, S9, 0x00 | |||
| fld.d F1, S10, 0x00 | |||
| fld.d F2, S11, 0x00 | |||
| fld.d F3, S12, 0x00 | |||
| fld.d F4, S13, 0x00 | |||
| fld.d F5, S14, 0x00 | |||
| fld.d F6, S15, 0x00 | |||
| fld.d F7, S16, 0x00 | |||
| fst.d F0, TD, 0x40 | |||
| fst.d F1, TD, 0x48 | |||
| fst.d F2, TD, 0x50 | |||
| fst.d F3, TD, 0x58 | |||
| fst.d F4, TD, 0x60 | |||
| fst.d F5, TD, 0x68 | |||
| fst.d F6, TD, 0x70 | |||
| fst.d F7, TD, 0x78 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d S3, S3, 0x08 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d S5, S5, 0x08 | |||
| addi.d S6, S6, 0x08 | |||
| addi.d S7, S7, 0x08 | |||
| addi.d S8, S8, 0x08 | |||
| addi.d S9, S9, 0x08 | |||
| addi.d S10, S10, 0x08 | |||
| addi.d S11, S11, 0x08 | |||
| addi.d S12, S12, 0x08 | |||
| addi.d S13, S13, 0x08 | |||
| addi.d S14, S14, 0x08 | |||
| addi.d S15, S15, 0x08 | |||
| addi.d S16, S16, 0x08 | |||
| addi.d TD, TD, 0x80 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_I1 | |||
| .L_I7: | |||
| andi I, M, 0x07 | |||
| beq I, ZERO, .L_I0 | |||
| .L_II1: /* I-- */ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fld.d F4, S5, 0x00 | |||
| fld.d F5, S6, 0x00 | |||
| fld.d F6, S7, 0x00 | |||
| fld.d F7, S8, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| addi.d S3, S3, 0x08 | |||
| fst.d F3, TD, 0x18 | |||
| addi.d S4, S4, 0x08 | |||
| fst.d F4, TD, 0x20 | |||
| addi.d S5, S5, 0x08 | |||
| fst.d F5, TD, 0x28 | |||
| addi.d S6, S6, 0x08 | |||
| fst.d F6, TD, 0x30 | |||
| addi.d S7, S7, 0x08 | |||
| fst.d F7, TD, 0x38 | |||
| addi.d S8, S8, 0x08 | |||
| addi.d TD, TD, 0x40 | |||
| fld.d F0, S9, 0x00 | |||
| fld.d F1, S10, 0x00 | |||
| fld.d F2, S11, 0x00 | |||
| fld.d F3, S12, 0x00 | |||
| fld.d F4, S13, 0x00 | |||
| fld.d F5, S14, 0x00 | |||
| fld.d F6, S15, 0x00 | |||
| fld.d F7, S16, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d S9, S9, 0x08 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S10, S10, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| addi.d S11, S11, 0x08 | |||
| fst.d F3, TD, 0x18 | |||
| addi.d S12, S12, 0x08 | |||
| fst.d F4, TD, 0x20 | |||
| addi.d S13, S13, 0x08 | |||
| fst.d F5, TD, 0x28 | |||
| addi.d S14, S14, 0x08 | |||
| fst.d F6, TD, 0x30 | |||
| addi.d S15, S15, 0x08 | |||
| fst.d F7, TD, 0x38 | |||
| addi.d S16, S16, 0x08 | |||
| addi.d TD, TD, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_II1 | |||
| .L_I0: | |||
| blt ZERO, J, .L_J1 | |||
| .L_N8: | |||
| andi J, N, 0x08 | |||
| beq ZERO, J, .L_N4 | |||
| .L_J11: /* j--*/ | |||
| addi.d J, J, -1 | |||
| blt ZERO, J, .L_J1 | |||
| .L_N8: /* if(n&8)*/ | |||
| andi I, N, 0x08 | |||
| beq I, ZERO, .L_N4 | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x03 | |||
| move I, M | |||
| add.d S3, S2, TL | |||
| add.d S4, S2, T0 | |||
| add.d S5, S3, T0 | |||
| add.d S6, S4, T0 | |||
| add.d S7, S5, T0 | |||
| add.d S8, S6, T0 | |||
| add.d TS, S7, T0 | |||
| beq I, ZERO, .L_8I3 | |||
| .L_8I1: /* I-- */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| xvpackev.d D0, U1, U0 | |||
| xvpackod.d D1, U1, U0 | |||
| xvpackev.d D2, U3, U2 | |||
| xvpackod.d D3, U3, U2 | |||
| xvpackev.d D4, U5, U4 | |||
| xvpackod.d D5, U5, U4 | |||
| xvpackev.d D6, U7, U6 | |||
| xvpackod.d D7, U7, U6 | |||
| xvand.v U0, D0, D0 | |||
| xvpermi.q D0, D2, 0x02 // 0 | |||
| xvand.v U4, D4, D4 | |||
| xvpermi.q D4, D6, 0x02 // 1 | |||
| xvand.v U1, D1, D1 | |||
| xvpermi.q D1, D3, 0x02 // 2 | |||
| xvand.v U5, D5, D5 | |||
| xvpermi.q D5, D7, 0x02 // 3 | |||
| xvpermi.q D2, U0, 0x31 // 4 | |||
| xvpermi.q D6, U4, 0x31 // 5 | |||
| xvpermi.q D3, U1, 0x31 // 6 | |||
| xvpermi.q D7, U5, 0x31 // 7 | |||
| xvst D0, TD, 0x00 | |||
| xvst D4, TD, 0x20 | |||
| xvst D1, TD, 0x40 | |||
| xvst D5, TD, 0x60 | |||
| xvst D2, TD, 0x80 | |||
| xvst D6, TD, 0xA0 | |||
| xvst D3, TD, 0xC0 | |||
| xvst D7, TD, 0xE0 | |||
| addi.d TD, TD, 0x100 | |||
| xvld U0, S1, 0x20 | |||
| xvld U1, S2, 0x20 | |||
| xvld U2, S3, 0x20 | |||
| xvld U3, S4, 0x20 | |||
| xvld U4, S5, 0x20 | |||
| xvld U5, S6, 0x20 | |||
| xvld U6, S7, 0x20 | |||
| xvld U7, S8, 0x20 | |||
| xvpackev.d D0, U1, U0 | |||
| xvpackod.d D1, U1, U0 | |||
| xvpackev.d D2, U3, U2 | |||
| xvpackod.d D3, U3, U2 | |||
| xvpackev.d D4, U5, U4 | |||
| xvpackod.d D5, U5, U4 | |||
| xvpackev.d D6, U7, U6 | |||
| xvpackod.d D7, U7, U6 | |||
| xvand.v U0, D0, D0 | |||
| xvpermi.q D0, D2, 0x02 // 0 | |||
| xvand.v U4, D4, D4 | |||
| xvpermi.q D4, D6, 0x02 // 1 | |||
| xvand.v U1, D1, D1 | |||
| xvpermi.q D1, D3, 0x02 // 2 | |||
| xvand.v U5, D5, D5 | |||
| xvpermi.q D5, D7, 0x02 // 3 | |||
| xvpermi.q D2, U0, 0x31 // 4 | |||
| xvpermi.q D6, U4, 0x31 // 5 | |||
| xvpermi.q D3, U1, 0x31 // 6 | |||
| xvpermi.q D7, U5, 0x31 // 7 | |||
| xvst D0, TD, 0x00 | |||
| xvst D4, TD, 0x20 | |||
| xvst D1, TD, 0x40 | |||
| xvst D5, TD, 0x60 | |||
| xvst D2, TD, 0x80 | |||
| xvst D6, TD, 0xA0 | |||
| xvst D3, TD, 0xC0 | |||
| xvst D7, TD, 0xE0 | |||
| addi.d TD, TD, 0x100 | |||
| addi.d S1, S1, 0x40 | |||
| addi.d S2, S2, 0x40 | |||
| addi.d S3, S3, 0x40 | |||
| addi.d S4, S4, 0x40 | |||
| addi.d S5, S5, 0x40 | |||
| addi.d S6, S6, 0x40 | |||
| addi.d S7, S7, 0x40 | |||
| addi.d S8, S8, 0x40 | |||
| add.d S4, S3, TL | |||
| add.d S5, S4, TL | |||
| add.d S6, S5, TL | |||
| add.d S7, S6, TL | |||
| add.d S8, S7, TL | |||
| add.d TS, S8, TL | |||
| beq I, ZERO, .L_N4 | |||
| .L_N81: /* if(i>0) i--*/ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fld.d F4, S5, 0x00 | |||
| fld.d F5, S6, 0x00 | |||
| fld.d F6, S7, 0x00 | |||
| fld.d F7, S8, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| fst.d F1, TD, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| fst.d F3, TD, 0x18 | |||
| fst.d F4, TD, 0x20 | |||
| fst.d F5, TD, 0x28 | |||
| fst.d F6, TD, 0x30 | |||
| fst.d F7, TD, 0x38 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d S3, S3, 0x08 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d S5, S5, 0x08 | |||
| addi.d S6, S6, 0x08 | |||
| addi.d S7, S7, 0x08 | |||
| addi.d S8, S8, 0x08 | |||
| addi.d TD, TD, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_8I1 | |||
| .L_8I3: | |||
| andi I, M, 0x07 | |||
| beq I, ZERO, .L_N4 | |||
| .L_8I11: | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fld.d F4, S5, 0x00 | |||
| fld.d F5, S6, 0x00 | |||
| fld.d F6, S7, 0x00 | |||
| fld.d F7, S8, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| addi.d S3, S3, 0x08 | |||
| fst.d F3, TD, 0x18 | |||
| addi.d S4, S4, 0x08 | |||
| fst.d F4, TD, 0x20 | |||
| addi.d S5, S5, 0x08 | |||
| fst.d F5, TD, 0x28 | |||
| addi.d S6, S6, 0x08 | |||
| fst.d F6, TD, 0x30 | |||
| addi.d S7, S7, 0x08 | |||
| fst.d F7, TD, 0x38 | |||
| addi.d S8, S8, 0x08 | |||
| addi.d TD, TD, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_8I11 | |||
| .L_N4: | |||
| andi J, N, 0x04 | |||
| beq ZERO, J, .L_N2 | |||
| blt ZERO, I, .L_N81 | |||
| .L_N4: /* if(n&4)*/ | |||
| andi I, N, 0x04 | |||
| beq I, ZERO, .L_N2 | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x02 | |||
| move I, M | |||
| add.d S3, S2, TL | |||
| add.d S4, S2, T0 | |||
| add.d TS, S3, T0 | |||
| beq I, ZERO, .L_I3 | |||
| .L_4I1: /* I-- */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvpackev.d D0, U1, U0 | |||
| xvpackod.d D1, U1, U0 | |||
| xvpackev.d D2, U3, U2 | |||
| xvpackod.d D3, U3, U2 | |||
| xvand.v U0, D0, D0 | |||
| xvpermi.q D0, D2, 0x02 // 0 | |||
| xvand.v U1, D1, D1 | |||
| xvpermi.q D1, D3, 0x02 // 1 | |||
| xvpermi.q D2, U0, 0x31 // 2 | |||
| xvpermi.q D3, U1, 0x31 // 3 | |||
| xvst D0, TD, 0x00 | |||
| xvst D1, TD, 0x20 | |||
| xvst D2, TD, 0x40 | |||
| xvst D3, TD, 0x60 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| addi.d TD, TD, 0x80 | |||
| add.d S4, S3, TL | |||
| add.d TS, S4, TL | |||
| beq I, ZERO, .L_N2 | |||
| .L_N41: /* if(i>0)*/ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| fst.d F1, TD, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| fst.d F3, TD, 0x18 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d S3, S3, 0x08 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d TD, TD, 0x20 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_4I1 | |||
| .L_I3: | |||
| andi I, M, 0x03 | |||
| beq I, ZERO, .L_N2 | |||
| .L_4II1: | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| addi.d S3, S3, 0x08 | |||
| fst.d F3, TD, 0x18 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d TD, TD, 0x20 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_4II1 | |||
| .L_N2: | |||
| andi J, N, 0x02 | |||
| beq ZERO, J, .L_N1 | |||
| blt ZERO, I, .L_N41 | |||
| .L_N2: /* if(n&2)*/ | |||
| andi I, N, 0x02 | |||
| beq I, ZERO, .L_N1 | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x01 | |||
| move I, M | |||
| add.d TS, S2, TL | |||
| beq I, ZERO, .L_NI1 | |||
| .L_2I1: /* I-- */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvpackev.d D0, U1, U0 | |||
| xvpackod.d D1, U1, U0 | |||
| xvpermi.q D0, D1, 0x02 // 0 | |||
| beq I, ZERO, .L_N1 | |||
| xvst D0, TD, 0x00 | |||
| .L_N21: /* if(i>0)*/ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d TD, TD, 0x20 | |||
| fst.d F0, TD, 0x00 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d TD, TD, 0x10 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_2I1 | |||
| .L_NI1: | |||
| andi I, M, 0x01 | |||
| beq I, ZERO, .L_N1 | |||
| blt ZERO, I, .L_N21 | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| .L_N1: /* if(n&2)*/ | |||
| andi I, N, 0x01 | |||
| beq I, ZERO, .L_N0 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d TD, TD, 0x10 | |||
| move S1, TS | |||
| move I, M | |||
| beq I, ZERO, .L_N0 | |||
| .L_N1: | |||
| move S1, TS | |||
| beq ZERO, M, .L_N0 | |||
| .L_N11: /* if(i>0)*/ | |||
| fld.d F0, S1, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| .L_M1: | |||
| fld.d F0, S1, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d TD, TD, 0x08 | |||
| addi.d M, M, -1 | |||
| blt ZERO, M, .L_M1 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d TD, TD, 0x08 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_N11 | |||
| .L_N0: | |||
| LDARG $r23, $sp, 0x00 | |||
| LDARG $r24, $sp, 0x08 | |||
| LDARG $r25, $sp, 0x10 | |||
| LDARG $r26, $sp, 0x18 | |||
| LDARG $r27, $sp, 0x20 | |||
| LDARG $r28, $sp, 0x28 | |||
| LDARG $r29, $sp, 0x30 | |||
| LDARG $r30, $sp, 0x38 | |||
| LDARG $r31, $sp, 0x40 | |||
| LD $f23, $sp, 0x48 | |||
| LD $f24, $sp, 0x50 | |||
| LD $f25, $sp, 0x58 | |||
| LD $f26, $sp, 0x60 | |||
| LD $f27, $sp, 0x68 | |||
| LD $f28, $sp, 0x70 | |||
| LD $f29, $sp, 0x78 | |||
| LD $f30, $sp, 0x80 | |||
| LD $f31, $sp, 0x88 | |||
| addi.d $sp, $sp, 0x90 | |||
| jirl $r0, $r1, 0x00 | |||
| LDARG $r23, $sp, 0 | |||
| LDARG $r24, $sp, 8 | |||
| LDARG $r25, $sp, 16 | |||
| LDARG $r26, $sp, 24 | |||
| LDARG $r27, $sp, 32 | |||
| LDARG $r28, $sp, 40 | |||
| LDARG $r29, $sp, 48 | |||
| LDARG $r30, $sp, 56 | |||
| addi.d $sp, $sp, 64 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -94,7 +94,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| CMPEQ $fcc1, ALPHAI, a1 | |||
| bge $r0, I, .L19 | |||
| /////// INCX == 1 && N >= 4 //////// | |||
| bnez DUMMY2, .L17 // if DUMMPY2 == 1, called from c/zscal. | |||
| bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal. | |||
| bceqz $fcc0, .L17 | |||
| @@ -146,6 +146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L17 | |||
| b .L19 | |||
| .align 3 | |||
| /////// INCX == 1 && N < 8 /////// | |||
| @@ -156,7 +157,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| andi I, N, 7 | |||
| #endif | |||
| beqz I, .L999 | |||
| bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. | |||
| bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal. | |||
| bceqz $fcc0, .L998 | |||
| @@ -171,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| CMPEQ $fcc1, ALPHAI, a1 | |||
| move XX, X | |||
| bge $r0, I, .L29 | |||
| bnez DUMMY2, .L25 // if DUMMPY2 == 1, called from c/zscal. | |||
| bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal. | |||
| bceqz $fcc0, .L25 | |||
| bceqz $fcc1, .L25 | |||
| @@ -341,7 +342,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| andi I, N, 7 | |||
| #endif | |||
| beqz I, .L999 | |||
| bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. | |||
| bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal. | |||
| bceqz $fcc0, .L998 | |||
| @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHAI $f1 | |||
| #define X $r7 | |||
| #define INCX $r8 | |||
| #define DUMMY2 $r9 | |||
| #define I $r12 | |||
| #define TEMP $r13 | |||
| @@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bge $r0, N, .L999 | |||
| bge $r0, INCX, .L999 | |||
| ld.d DUMMY2, $sp, 0 | |||
| li.d TEMP, 1 | |||
| movgr2fr.d a1, $r0 | |||
| FFINT a1, a1 | |||
| @@ -84,24 +86,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| srai.d I, N, 2 | |||
| bne INCX, TEMP, .L22 | |||
| /////// INCX == 1 //////// | |||
| .L11: | |||
| bge $r0, I, .L997 | |||
| CMPEQ $fcc0, ALPHAR, a1 | |||
| CMPEQ $fcc1, ALPHAI, a1 | |||
| bceqz $fcc0, .L13 | |||
| b .L14 | |||
| .align 3 | |||
| bge $r0, I, .L19 | |||
| .L13: | |||
| bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 | |||
| b .L113 //alpha_r != 0.0 && alpha_i == 0.0 | |||
| /////// INCX == 1 && N >= 4 //////// | |||
| bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal. | |||
| .L14: | |||
| bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 | |||
| b .L111 //alpha_r == 0.0 && alpha_i == 0.0 | |||
| .align 3 | |||
| bceqz $fcc0, .L17 | |||
| .L111: //alpha_r == 0.0 && alpha_i == 0.0 | |||
| bceqz $fcc1, .L17 | |||
| .L15: //alpha_r == 0.0 && alpha_i == 0.0 | |||
| vst VXZ, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| vst VXZ, X, 2 * SIZE | |||
| @@ -112,50 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| addi.d X, X, 8 * SIZE | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L111 | |||
| b .L997 | |||
| .align 3 | |||
| .L113: //alpha_r != 0.0 && alpha_i == 0.0 | |||
| vld VX0, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| vld VX1, X, 2 * SIZE | |||
| vpickev.d x1, VX1, VX0 | |||
| vpickod.d x2, VX1, VX0 | |||
| vfmul.d x3, VXAR, x1 | |||
| vfmul.d x4, VXAR, x2 | |||
| vilvl.d VX2, x4 ,x3 | |||
| vilvh.d VX3, x4, x3 | |||
| vst VX2, X, 0 * SIZE | |||
| vst VX3, X, 2 * SIZE | |||
| vld VX0, X, 4 * SIZE | |||
| vld VX1, X, 6 * SIZE | |||
| vpickev.d x1, VX1, VX0 | |||
| vpickod.d x2, VX1, VX0 | |||
| vfmul.d x3, VXAR, x1 | |||
| vfmul.d x4, VXAR, x2 | |||
| vilvl.d VX2, x4 ,x3 | |||
| vilvh.d VX3, x4, x3 | |||
| vst VX2, X, 4 * SIZE | |||
| vst VX3, X, 6 * SIZE | |||
| #else | |||
| vld VX1, X, 4 * SIZE | |||
| vpickev.w x1, VX1, VX0 | |||
| vpickod.w x2, VX1, VX0 | |||
| vfmul.s x3, VXAR, x1 | |||
| vfmul.s x4, VXAR, x2 | |||
| vilvl.w VX2, x4 ,x3 | |||
| vilvh.w VX3, x4, x3 | |||
| vst VX2, X, 0 * SIZE | |||
| vst VX3, X, 4 * SIZE | |||
| #endif | |||
| addi.d X, X, 8 * SIZE | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L113 | |||
| b .L997 | |||
| blt $r0, I, .L15 | |||
| b .L19 | |||
| .align 3 | |||
| .L114: //alpha_r != 0.0 && alpha_i != 0.0 | |||
| .L17: | |||
| vld VX0, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| vld VX1, X, 2 * SIZE | |||
| @@ -196,29 +155,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| addi.d X, X, 8 * SIZE | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L114 | |||
| b .L997 | |||
| blt $r0, I, .L17 | |||
| b .L19 | |||
| .align 3 | |||
| /////// INCX == 1 && N < 8 /////// | |||
| .L19: /* tail of the INCX == 1 path: pick the scalar loop for the leftover elements */ | |||
| andi I, N, 3 /* I = N & 3 */ | |||
| beqz I, .L999 /* nothing left over */ | |||
| bnez DUMMY2, .L998 // if DUMMY2 != 0 (called from c/zscal), skip the zero-fill shortcut below. | |||
| bceqz $fcc0, .L998 /* $fcc0 clear: not the alpha_r == 0.0 case */ | |||
| bceqz $fcc1, .L998 /* $fcc1 clear: not the alpha_i == 0.0 case */ | |||
| b .L995 // alpha_r == 0.0 && alpha_i == 0.0 | |||
| /////// INCX != 1 //////// | |||
| .L22: | |||
| bge $r0, I, .L997 | |||
| move XX, X | |||
| CMPEQ $fcc0, ALPHAR, a1 | |||
| CMPEQ $fcc1, ALPHAI, a1 | |||
| bceqz $fcc0, .L23 | |||
| b .L24 | |||
| .align 3 | |||
| move XX, X | |||
| bge $r0, I, .L29 | |||
| bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal. | |||
| .L23: | |||
| bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 | |||
| b .L223 //alpha_r != 0.0 && alpha_i == 0.0 | |||
| bceqz $fcc0, .L25 | |||
| .L24: | |||
| bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 | |||
| b .L221 //alpha_r == 0.0 && alpha_i == 0.0 | |||
| .align 3 | |||
| bceqz $fcc1, .L25 | |||
| .L221: //alpha_r == 0.0 && alpha_i == 0.0 | |||
| .L27: //alpha_r == 0.0 && alpha_i == 0.0 | |||
| #ifdef DOUBLE | |||
| vstelm.d VXZ, X, 0, 0 | |||
| vstelm.d VXZ, X, 1 * SIZE, 0 | |||
| @@ -246,92 +211,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| add.d X, X, INCX | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L221 | |||
| b .L997 | |||
| blt $r0, I, .L27 | |||
| b .L29 | |||
| .align 3 | |||
| .L223: //alpha_r != 0.0 && alpha_i == 0.0 | |||
| #ifdef DOUBLE | |||
| ld.d t1, X, 0 * SIZE | |||
| ld.d t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| ld.d t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| vinsgr2vr.d x1, t1, 0 | |||
| vinsgr2vr.d x2, t2, 0 | |||
| vinsgr2vr.d x1, t3, 1 | |||
| vinsgr2vr.d x2, t4, 1 | |||
| vfmul.d x3, VXAR, x1 | |||
| vfmul.d x4, VXAR, x2 | |||
| vstelm.d x3, XX, 0 * SIZE, 0 | |||
| vstelm.d x4, XX, 1 * SIZE, 0 | |||
| add.d XX, XX, INCX | |||
| vstelm.d x3, XX, 0 * SIZE, 1 | |||
| vstelm.d x4, XX, 1 * SIZE, 1 | |||
| add.d XX, XX, INCX | |||
| ld.d t1, X, 0 * SIZE | |||
| ld.d t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| ld.d t4, X, 1 * SIZE | |||
| vinsgr2vr.d x1, t1, 0 | |||
| vinsgr2vr.d x2, t2, 0 | |||
| vinsgr2vr.d x1, t3, 1 | |||
| vinsgr2vr.d x2, t4, 1 | |||
| add.d X, X, INCX | |||
| vfmul.d x3, VXAR, x1 | |||
| vfmul.d x4, VXAR, x2 | |||
| addi.d I, I, -1 | |||
| vstelm.d x3, XX, 0 * SIZE, 0 | |||
| vstelm.d x4, XX, 1 * SIZE, 0 | |||
| add.d XX, XX, INCX | |||
| vstelm.d x3, XX, 0 * SIZE, 1 | |||
| vstelm.d x4, XX, 1 * SIZE, 1 | |||
| #else | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| vinsgr2vr.w x1, t1, 0 | |||
| vinsgr2vr.w x2, t2, 0 | |||
| vinsgr2vr.w x1, t3, 1 | |||
| vinsgr2vr.w x2, t4, 1 | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| vinsgr2vr.w x1, t1, 2 | |||
| vinsgr2vr.w x2, t2, 2 | |||
| vinsgr2vr.w x1, t3, 3 | |||
| vinsgr2vr.w x2, t4, 3 | |||
| add.d X, X, INCX | |||
| vfmul.s x3, VXAR, x1 | |||
| vfmul.s x4, VXAR, x2 | |||
| addi.d I, I, -1 | |||
| vstelm.w x3, XX, 0 * SIZE, 0 | |||
| vstelm.w x4, XX, 1 * SIZE, 0 | |||
| add.d XX, XX, INCX | |||
| vstelm.w x3, XX, 0 * SIZE, 1 | |||
| vstelm.w x4, XX, 1 * SIZE, 1 | |||
| add.d XX, XX, INCX | |||
| vstelm.w x3, XX, 0 * SIZE, 2 | |||
| vstelm.w x4, XX, 1 * SIZE, 2 | |||
| add.d XX, XX, INCX | |||
| vstelm.w x3, XX, 0 * SIZE, 3 | |||
| vstelm.w x4, XX, 1 * SIZE, 3 | |||
| #endif | |||
| add.d XX, XX, INCX | |||
| blt $r0, I, .L223 | |||
| b .L997 | |||
| .align 3 | |||
| .L224: //alpha_r != 0.0 && alpha_i != 0.0 | |||
| .L25: | |||
| #ifdef DOUBLE | |||
| ld.d t1, X, 0 * SIZE | |||
| ld.d t2, X, 1 * SIZE | |||
| @@ -414,15 +298,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vstelm.w x4, XX, 1 * SIZE, 3 | |||
| #endif | |||
| add.d XX, XX, INCX | |||
| blt $r0, I, .L224 | |||
| b .L997 | |||
| blt $r0, I, .L25 | |||
| b .L29 | |||
| .align 3 | |||
| .L997: | |||
| andi I, N, 3 | |||
| bge $r0, I, .L999 | |||
| .align 3 | |||
| /////// INCX != 1 && N < 8 /////// | |||
| .L29: /* tail of the INCX != 1 path: pick the scalar loop for the leftover elements */ | |||
| andi I, N, 3 /* I = N & 3 */ | |||
| beqz I, .L999 /* nothing left over */ | |||
| bnez DUMMY2, .L998 // if DUMMY2 != 0 (called from c/zscal), skip the zero-fill shortcut below. | |||
| bceqz $fcc0, .L998 /* $fcc0 clear: not the alpha_r == 0.0 case */ | |||
| bceqz $fcc1, .L998 /* $fcc1 clear: not the alpha_i == 0.0 case */ | |||
| b .L995 // alpha_r == 0.0 && alpha_i == 0.0 | |||
| .L995: // alpha_r == 0.0 && alpha_i == 0.0 | |||
| ST a1, X, 0 * SIZE | |||
| ST a1, X, 1 * SIZE | |||
| addi.d I, I, -1 | |||
| add.d X, X, INCX | |||
| blt $r0, I, .L995 | |||
| b .L999 | |||
| .L998: | |||
| LD a1, X, 0 * SIZE | |||
| LD a2, X, 1 * SIZE | |||
| @@ -435,7 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ST s2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| blt $r0, I, .L998 | |||
| .align 3 | |||
| b .L999 | |||
| .L999: | |||
| move $r4, $r12 | |||
| @@ -53,6 +53,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| li.d TEMP, 2 * SIZE | |||
| ld.d XX, $sp, 0 // Load dummy2 | |||
| slli.d XX, XX, ZBASE_SHIFT | |||
| MTC a1, $r0 | |||
| slli.d INCX, INCX, ZBASE_SHIFT | |||
| bge $r0, N, .L999 | |||
| @@ -60,6 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| CMPEQ $fcc1, ALPHA_I, a1 | |||
| bceqz $fcc0, .L50 | |||
| bceqz $fcc1, .L50 | |||
| beq XX, TEMP, .L50 // if dummy2 == 1, do not directly copy 0 | |||
| srai.d I, N, 2 | |||
| bne INCX, TEMP, .L20 | |||
| bge $r0, I, .L15 | |||
| @@ -1,5 +1,5 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| @@ -25,61 +25,58 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2013/09/14 Saar | |||
| * BLASTEST float : OK | |||
| * BLASTEST double : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| // The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces. | |||
| // In certain cases, the expected return values for cblas_c/zscal differ from those of other upper-level interfaces. | |||
| // To handle this, we use the dummy2 parameter to differentiate between them. | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG inc_x2; | |||
| BLASLONG ip = 0; | |||
| FLOAT temp; | |||
| BLASLONG i = 0; | |||
| BLASLONG inc_x2; | |||
| BLASLONG ip = 0; | |||
| FLOAT temp; | |||
| inc_x2 = 2 * inc_x; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| if ( da_r == 0.0 ) | |||
| { | |||
| if ( da_i == 0.0 ) | |||
| { | |||
| temp = 0.0; | |||
| x[ip+1] = 0.0 ; | |||
| } | |||
| else | |||
| { | |||
| temp = - da_i * x[ip+1] ; | |||
| if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; | |||
| if (!isinf(x[ip+1])) | |||
| x[ip+1] = da_i * x[ip] ; | |||
| else x[ip+1] = NAN; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( da_i == 0.0 ) | |||
| { | |||
| temp = da_r * x[ip] ; | |||
| if (!isinf(x[ip+1])) | |||
| x[ip+1] = da_r * x[ip+1]; | |||
| else x[ip+1] = NAN; | |||
| } | |||
| else | |||
| { | |||
| temp = da_r * x[ip] - da_i * x[ip+1] ; | |||
| if (!isinf(x[ip+1])) | |||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||
| else x[ip+1] = NAN; | |||
| } | |||
| } | |||
| if ( da_r != da_r ) | |||
| x[ip] = da_r; | |||
| else | |||
| x[ip] = temp; | |||
| ip += inc_x2; | |||
| } | |||
| if ((n <= 0) || (inc_x <= 0)) | |||
| return(0); | |||
| return(0); | |||
| inc_x2 = 2 * inc_x; | |||
| if (dummy2 == 0) { | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| if (da_r == 0.0 && da_i == 0.0) | |||
| { | |||
| x[ip] = 0.0; | |||
| x[ip+1] = 0.0; | |||
| } | |||
| else | |||
| { | |||
| temp = da_r * x[ip] - da_i * x[ip+1]; | |||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||
| x[ip] = temp; | |||
| } | |||
| } | |||
| ip += inc_x2; | |||
| } | |||
| return(0); | |||
| } | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| temp = da_r * x[ip] - da_i * x[ip+1]; | |||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||
| x[ip] = temp; | |||
| ip += inc_x2; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -6,6 +6,9 @@ CROTKERNEL = ../mips/zrot.c | |||
| ZROTKERNEL = ../mips/zrot.c | |||
| CSWAPKERNEL = ../mips/zswap.c | |||
| ZSWAPKERNEL = ../mips/zswap.c | |||
| CSCALKERNEL = ../mips/zscal.c | |||
| ZSCALKERNEL = ../mips/zscal.c | |||
| ifndef SNRM2KERNEL | |||
| @@ -51,6 +51,7 @@ | |||
| #define X r8 | |||
| #define INCX r9 | |||
| #endif | |||
| #define FLAG r11 | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| @@ -61,6 +62,7 @@ | |||
| #define X r8 | |||
| #define INCX r9 | |||
| #endif | |||
| #define FLAG r11 | |||
| #endif | |||
| #define FZERO f0 | |||
| @@ -94,6 +96,10 @@ | |||
| fcmpu cr0, FZERO, ALPHA_I | |||
| bne- cr0, LL(A1I1) | |||
| LDLONG FLAG, 104(SP) | |||
| cmpwi cr0, FLAG, 1 | |||
| beq- cr0, LL(A1I1) | |||
| cmpwi cr0, INCX, 2 * SIZE | |||
| bne- cr0, LL(A0IN) | |||
| @@ -136,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| if ( inc_x <= 0 ) | |||
| return(0); | |||
| if (da_r == ZERO && da_i == ZERO) { | |||
| if (da_r == ZERO && da_i == ZERO && dummy2 == 0) { | |||
| //clear the vector and return | |||
| if (inc_x == 1) { | |||
| memset(x, 0, n*COMPSIZE*SIZE); | |||
| @@ -64,6 +64,7 @@ | |||
| #endif | |||
| #define INC1 r11 | |||
| #define FLAG r12 | |||
| #define FZERO f0 | |||
| #define ALPHA_R f1 | |||
| @@ -97,6 +98,10 @@ | |||
| fcmpu cr0, FZERO, ALPHA_I | |||
| bne- cr0, LL(A1I1) | |||
| lwz FLAG, FRAMESLOT(0)(SP) | |||
| cmpwi cr0, FLAG, 1 | |||
| beq- cr0, LL(A1I1) | |||
| LL(A0IN): | |||
| srawi. r0, N, 3 | |||
| mtspr CTR, r0 | |||
| @@ -169,6 +169,7 @@ SSYMV_U_KERNEL = symv_U_vector.c | |||
| SSYMV_L_KERNEL = symv_L_vector.c | |||
| DSYMV_U_KERNEL = symv_U_vector.c | |||
| DSYMV_L_KERNEL = symv_L_vector.c | |||
| CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| @@ -201,3 +202,12 @@ endif | |||
| ifndef ZGEMM_BETA | |||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| endif | |||
| ZOMATCOPY_CN = zomatcopy_cn_vector.c | |||
| COMATCOPY_CN = zomatcopy_cn_vector.c | |||
| DOMATCOPY_CN = omatcopy_cn_vector.c | |||
| SOMATCOPY_CN = omatcopy_cn_vector.c | |||
| SAXPBYKERNEL = axpby_vector_v2.c | |||
| DAXPBYKERNEL = axpby_vector_v2.c | |||
| @@ -0,0 +1,149 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) | |||
| #define FLOAT_V_T vfloat32m8_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) | |||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m8) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m8) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m8) | |||
| #define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m8) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) | |||
| #else | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) | |||
| #define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) | |||
| #endif | |||
| /* | |||
| * axpby kernel: y := alpha * x + beta * y over n elements. | |||
| * | |||
| * n      number of elements; n <= 0 is a no-op | |||
| * alpha  scalar applied to x | |||
| * x      input vector, read with element stride inc_x | |||
| * beta   scalar applied to y | |||
| * y      input/output vector, accessed with element stride inc_y | |||
| * | |||
| * Always returns 0. | |||
| * | |||
| * inc_y == 0 is tested before the strided vector branches: every update then | |||
| * targets y[0], so the n updates have to be applied one after another. With the | |||
| * previous ordering, inc_x == 1 together with inc_y == 0 fell into the | |||
| * stride-0 vector path and never reached the serial recurrence. | |||
| */ | |||
| int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| FLOAT_V_T vx, vy; | |||
| unsigned int gvl; | |||
| if (n <= 0) | |||
| return (0); | |||
| if (inc_x == 1 && inc_y == 1) | |||
| { | |||
| /* both vectors contiguous: unit-stride loads and stores */ | |||
| while (n > 0) | |||
| { | |||
| gvl = VSETVL(n); | |||
| vx = VLEV_FLOAT(x, gvl); | |||
| vy = VLEV_FLOAT(y, gvl); | |||
| vy = VFMULVF_FLOAT(vy, beta, gvl); | |||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl); | |||
| VSEV_FLOAT(y, vy, gvl); | |||
| x += gvl; | |||
| y += gvl; | |||
| n -= gvl; | |||
| } | |||
| } | |||
| else if (inc_y == 0) | |||
| { | |||
| /* all updates hit y[0]: apply them serially, for any inc_x */ | |||
| FLOAT vf = y[0]; | |||
| for (; n > 0; n--) | |||
| { | |||
| vf = (vf * beta) + (x[0] * alpha); | |||
| x += inc_x; | |||
| } | |||
| y[0] = vf; | |||
| } | |||
| else if (1 == inc_x) | |||
| { | |||
| /* x contiguous, y strided (stride given to the intrinsics in bytes) */ | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| while (n > 0) | |||
| { | |||
| gvl = VSETVL(n); | |||
| vy = VLSEV_FLOAT(y, stride_y, gvl); | |||
| vx = VLEV_FLOAT(x, gvl); | |||
| vy = VFMULVF_FLOAT(vy, beta, gvl); | |||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl); | |||
| VSSEV_FLOAT(y, stride_y, vy, gvl); | |||
| x += gvl; | |||
| y += gvl * inc_y; | |||
| n -= gvl; | |||
| } | |||
| } | |||
| else if (1 == inc_y) | |||
| { | |||
| /* x strided, y contiguous */ | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| while (n > 0) | |||
| { | |||
| gvl = VSETVL(n); | |||
| vx = VLSEV_FLOAT(x, stride_x, gvl); | |||
| vy = VLEV_FLOAT(y, gvl); | |||
| vy = VFMULVF_FLOAT(vy, beta, gvl); | |||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl); | |||
| VSEV_FLOAT(y, vy, gvl); | |||
| x += gvl * inc_x; | |||
| y += gvl; | |||
| n -= gvl; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| /* both vectors strided */ | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT); | |||
| while (n > 0) | |||
| { | |||
| gvl = VSETVL(n); | |||
| vy = VLSEV_FLOAT(y, stride_y, gvl); | |||
| vx = VLSEV_FLOAT(x, stride_x, gvl); | |||
| vy = VFMULVF_FLOAT(vy, beta, gvl); | |||
| vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl); | |||
| VSSEV_FLOAT(y, stride_y, vy, gvl); | |||
| x += gvl * inc_x; | |||
| y += gvl * inc_y; | |||
| n -= gvl; | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,123 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) /* DOUBLE not defined (presumably the single-precision build): helpers bind to the e32/f32 m4 intrinsic names */ | |||
| #define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m4)() /* expands to a call; RISCV_RVV is defined outside this view - presumably maps the name to the toolchain's intrinsic, confirm */ | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) /* vector length to use for n remaining elements */ | |||
| #define FLOAT_V_T vfloat32m4_t /* vector register-group type used by the kernel below */ | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) /* unit-stride load */ | |||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) /* unit-stride store */ | |||
| #define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) /* vector times scalar */ | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) /* broadcast a scalar into a vector */ | |||
| #else /* DOUBLE defined: the same helper names bound to the e64/f64 m4 intrinsic names */ | |||
| #define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m4)() | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) | |||
| #define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) | |||
| #define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) | |||
| #endif /* !defined(DOUBLE) */ | |||
| /* | |||
|  * Out-of-place scaled copy without transposition: | |||
|  *   b[i * ldb + j] = alpha * a[i * lda + j]   for i in [0, cols), j in [0, rows). | |||
|  * | |||
|  * rows, cols  extent of the copied region; nothing is done if either is <= 0 | |||
|  * alpha       scale factor; 0.0 and 1.0 take dedicated paths (see below) | |||
|  * a, lda      source and the element distance between consecutive columns | |||
|  * b, ldb      destination and the element distance between consecutive columns | |||
|  * | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) | |||
| { | |||
|     BLASLONG i, j; | |||
|     FLOAT *aptr, *bptr; | |||
|     size_t vl; | |||
|     FLOAT_V_T va, va1; | |||
|     if ( rows <= 0 ) return(0); | |||
|     if ( cols <= 0 ) return(0); | |||
|     aptr = a; | |||
|     bptr = b; | |||
|     if ( alpha == 0.0 ) | |||
|     { | |||
|         /* Zero fill: a is never read on this path. The zero vector is built once | |||
|            at the maximum vector length and stored with the per-chunk length. */ | |||
|         vl = VSETVL_MAX; | |||
|         va = VFMVVF_FLOAT(0, vl); | |||
|         for ( i=0; i<cols ; i++ ) | |||
|         { | |||
|             for(j=0; j<rows; j+=vl) | |||
|             { | |||
|                 vl = VSETVL(rows - j); | |||
|                 VSEV_FLOAT(bptr + j, va, vl); | |||
|             } | |||
|             bptr += ldb; | |||
|         } | |||
|         return(0); | |||
|     } | |||
|     if ( alpha == 1.0 ) | |||
|     { | |||
|         /* Plain copy: no multiplication is performed. */ | |||
|         for ( i=0; i<cols ; i++ ) | |||
|         { | |||
|             for(j=0; j<rows; j+=vl) | |||
|             { | |||
|                 vl = VSETVL(rows - j); | |||
|                 va = VLEV_FLOAT(aptr + j, vl); | |||
|                 VSEV_FLOAT(bptr + j, va, vl); | |||
|             } | |||
|             aptr += lda; | |||
|             bptr += ldb; | |||
|         } | |||
|         return(0); | |||
|     } | |||
|     /* General case. An odd column count is peeled off first so the main loop | |||
|        can always process two columns per iteration. */ | |||
|     i = 0; | |||
|     if( cols % 2 ){ | |||
|         for(j=0; j<rows; j+=vl) | |||
|         { | |||
|             vl = VSETVL(rows - j); | |||
|             va = VLEV_FLOAT(aptr + j, vl); | |||
|             va = VFMULVF_FLOAT(va, alpha, vl); | |||
|             VSEV_FLOAT(bptr + j, va, vl); | |||
|         } | |||
|         aptr += lda; | |||
|         bptr += ldb; | |||
|         i = 1; | |||
|     } | |||
|     for ( ; i<cols ; i+=2 ) | |||
|     { | |||
|         for(j=0; j<rows; j+=vl) | |||
|         { | |||
|             vl = VSETVL(rows - j); | |||
|             va = VLEV_FLOAT(aptr + j, vl); | |||
|             va1= VLEV_FLOAT(aptr + lda + j, vl); | |||
|             va = VFMULVF_FLOAT(va, alpha, vl); | |||
|             va1= VFMULVF_FLOAT(va1, alpha, vl); | |||
|             VSEV_FLOAT(bptr + j, va, vl); | |||
|             VSEV_FLOAT(bptr + ldb + j, va1, vl); | |||
|         } | |||
|         aptr += 2 * lda; | |||
|         bptr += 2 * ldb; | |||
|     } | |||
|     return(0); | |||
| } | |||
| @@ -43,8 +43,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) | |||
| #endif | |||
| /* | |||
|  * Scalar complex axpy used by CNAME for short vectors. | |||
|  * Without CONJ: y += (da_r + i*da_i) * x. With CONJ: y += (da_r + i*da_i) * conj(x). | |||
|  * x and y hold interleaved (real, imaginary) pairs; inc_x and inc_y count complex | |||
|  * elements. The dummy arguments are accepted only to mirror CNAME's signature. | |||
|  * Returns 0; does nothing when n <= 0 or when da_r and da_i are both zero. | |||
|  * | |||
|  * Declared static inline: a plain C99 inline definition emits no external | |||
|  * definition, so a call the compiler chooses not to inline would fail to link. | |||
|  */ | |||
| #if !defined(DOUBLE) | |||
| static inline int small_caxpy_kernel(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| #else | |||
| static inline int small_zaxpy_kernel(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| #endif | |||
| { | |||
|     BLASLONG i=0; | |||
|     BLASLONG ix,iy; | |||
|     BLASLONG inc_x2; | |||
|     BLASLONG inc_y2; | |||
|     if ( n <= 0 ) return(0); | |||
|     if ( da_r == 0.0 && da_i == 0.0 ) return(0); | |||
|     ix = 0; | |||
|     iy = 0; | |||
|     /* Each complex element occupies two FLOATs, so the strides are doubled. */ | |||
|     inc_x2 = 2 * inc_x; | |||
|     inc_y2 = 2 * inc_y; | |||
|     while(i < n) | |||
|     { | |||
| #if !defined(CONJ) | |||
|         y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; | |||
|         y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; | |||
| #else | |||
|         y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; | |||
|         y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; | |||
| #endif | |||
|         ix += inc_x2 ; | |||
|         iy += inc_y2 ; | |||
|         i++ ; | |||
|     } | |||
|     return(0); | |||
| } | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| #if !defined(DOUBLE) | |||
| if(n < 16) { | |||
| return small_caxpy_kernel(n, dummy0, dummy1, da_r, da_i, x, inc_x, y, inc_y, dummy, dummy2); | |||
| } | |||
| #else | |||
| if(n < 8) { | |||
| return small_zaxpy_kernel(n, dummy0, dummy1, da_r, da_i, x, inc_x, y, inc_y, dummy, dummy2); | |||
| } | |||
| #endif | |||
| BLASLONG i = 0, j = 0; | |||
| BLASLONG ix = 0,iy = 0; | |||
| if(n <= 0) return(0); | |||
| @@ -68,8 +68,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) | |||
| #endif | |||
| /* | |||
|  * Scalar complex dot product used by CNAME for short vectors. | |||
|  * Without CONJ: sum of x[k] * y[k]. With CONJ: sum of conj(x[k]) * y[k]. | |||
|  * x and y hold interleaved (real, imaginary) pairs; inc_x and inc_y count complex | |||
|  * elements. Returns the accumulated value, or zero when n < 1. | |||
|  * | |||
|  * Declared static inline: a plain C99 inline definition emits no external | |||
|  * definition, so a call the compiler chooses not to inline would fail to link. | |||
|  */ | |||
| #if !defined(DOUBLE) | |||
| static inline OPENBLAS_COMPLEX_FLOAT small_cdot_kernel(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #else | |||
| static inline OPENBLAS_COMPLEX_FLOAT small_zdot_kernel(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #endif | |||
| { | |||
|     BLASLONG i=0; | |||
|     BLASLONG ix=0,iy=0; | |||
|     FLOAT dot[2]; | |||
|     OPENBLAS_COMPLEX_FLOAT result; | |||
|     BLASLONG inc_x2; | |||
|     BLASLONG inc_y2; | |||
|     dot[0]=0.0; | |||
|     dot[1]=0.0; | |||
|     /* Initialise the result up front so the early return below yields zero. */ | |||
|     CREAL(result) = 0.0 ; | |||
|     CIMAG(result) = 0.0 ; | |||
|     if ( n < 1 ) return(result); | |||
|     /* Each complex element occupies two FLOATs, so the strides are doubled. */ | |||
|     inc_x2 = 2 * inc_x ; | |||
|     inc_y2 = 2 * inc_y ; | |||
|     while(i < n) | |||
|     { | |||
| #if !defined(CONJ) | |||
|         dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; | |||
|         dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; | |||
| #else | |||
|         dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; | |||
|         dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; | |||
| #endif | |||
|         ix += inc_x2 ; | |||
|         iy += inc_y2 ; | |||
|         i++ ; | |||
|     } | |||
|     CREAL(result) = dot[0]; | |||
|     CIMAG(result) = dot[1]; | |||
|     return(result); | |||
| } | |||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| #if !defined(DOUBLE) | |||
| if(n < 16) { | |||
| return small_cdot_kernel(n, x, inc_x, y, inc_y); | |||
| } | |||
| #else | |||
| if(n < 8) { | |||
| return small_zdot_kernel(n, x, inc_x, y, inc_y); | |||
| } | |||
| #endif | |||
| BLASLONG i=0, j=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT dot[2]; | |||
| @@ -148,4 +200,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| CREAL(result) = dot[0]; | |||
| CIMAG(result) = dot[1]; | |||
| return(result); | |||
| } | |||
| } | |||
| @@ -66,7 +66,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| BLASLONG lda2 = lda * 2; | |||
| vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| vy1_new = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl); | |||
| for (k = 0, j = 0; k < m / gvl; k++) | |||
| for (k = 0, j = 0; k < m / gvl; k ++) | |||
| { | |||
| a_ptr = a; | |||
| ix = 0; | |||
| @@ -121,30 +121,73 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| #endif | |||
| a_ptr += lda2; | |||
| ix += inc_x2; | |||
| } | |||
| for (; i < n; i += 4) | |||
| for (i = n % 4 ; i < n; i += 4) | |||
| { | |||
| #if !defined(XCONJ) | |||
| x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4); | |||
| x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4); | |||
| temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4); | |||
| temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4); | |||
| temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 4); | |||
| temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 4); | |||
| VSEV_FLOAT(&temp_rr[0], temp_rv, 4); | |||
| VSEV_FLOAT(&temp_ii[0], temp_iv, 4); | |||
| // temp_rr[0] = alpha_r * x[ix] - alpha_i * x[ix + 1]; | |||
| // temp_rr[1] = alpha_r * x[ix + inc_x2] - alpha_i * x[ix + inc_x2 + 1]; | |||
| x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 2); | |||
| x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 2); | |||
| temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2); | |||
| temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 2); | |||
| // temp_ii[0] = alpha_r * x[ix + 1] + alpha_i * x[ix]; | |||
| // temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] + alpha_i * x[ix + inc_x2]; | |||
| temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 2); | |||
| temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 2); | |||
| VSEV_FLOAT(&temp_rr[0], temp_rv, 2); | |||
| VSEV_FLOAT(&temp_ii[0], temp_iv, 2); | |||
| // temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] - alpha_i * x[ix + inc_x2 * 2 + 1]; | |||
| // temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] - alpha_i * x[ix + inc_x2 * 3 + 1]; | |||
| x_v0 = VLSEV_FLOAT(&x[ix + inc_x2 * 2], inc_x2 * sizeof(FLOAT), 2); | |||
| x_v1 = VLSEV_FLOAT(&x[ix + inc_x2 * 2 + 1], inc_x2 * sizeof(FLOAT), 2); | |||
| temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2); | |||
| temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 2); | |||
| // temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] + alpha_i * x[ix + inc_x2 * 2]; | |||
| // temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] + alpha_i * x[ix + inc_x2 * 3]; | |||
| temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 2); | |||
| temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 2); | |||
| VSEV_FLOAT(&temp_rr[2], temp_rv, 2); | |||
| VSEV_FLOAT(&temp_ii[2], temp_iv, 2); | |||
| #else | |||
| x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4); | |||
| x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4); | |||
| temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4); | |||
| temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4); | |||
| temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 4); | |||
| temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_r, x_v1, 4); | |||
| VSEV_FLOAT(&temp_rr[0], temp_rv, 4); | |||
| VSEV_FLOAT(&temp_ii[0], temp_iv, 4); | |||
| // temp_rr[0] = alpha_r * x[ix] + alpha_i * x[ix + 1]; | |||
| // temp_rr[1] = alpha_r * x[ix + inc_x2] + alpha_i * x[ix + inc_x2 + 1]; | |||
| x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 2); | |||
| x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 2); | |||
| temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2); | |||
| temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 2); | |||
| // temp_ii[0] = alpha_r * x[ix + 1] - alpha_i * x[ix]; | |||
| // temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] - alpha_i * x[ix + inc_x2]; | |||
| temp_iv = VFMUL_VF_FLOAT(x_v1, alpha_r, 2); | |||
| temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_i, x_v0, 2); | |||
| VSEV_FLOAT(&temp_rr[0], temp_rv, 2); | |||
| VSEV_FLOAT(&temp_ii[0], temp_iv, 2); | |||
| // temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] + alpha_i * x[ix + inc_x2 * 2 + 1]; | |||
| // temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] + alpha_i * x[ix + inc_x2 * 3 + 1]; | |||
| x_v0 = VLSEV_FLOAT(&x[ix + inc_x2 * 2], inc_x2 * sizeof(FLOAT), 2); | |||
| x_v1 = VLSEV_FLOAT(&x[ix + inc_x2 * 2 + 1], inc_x2 * sizeof(FLOAT), 2); | |||
| temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2); | |||
| temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 2); | |||
| temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] - alpha_i * x[ix + inc_x2 * 2]; | |||
| temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] - alpha_i * x[ix + inc_x2 * 3]; | |||
| temp_iv = VFMUL_VF_FLOAT(x_v1, alpha_r, 2); | |||
| temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_i, x_v0, 2); | |||
| VSEV_FLOAT(&temp_rr[2], temp_rv, 2); | |||
| VSEV_FLOAT(&temp_ii[2], temp_iv, 2); | |||
| #endif | |||
| @@ -257,7 +300,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); | |||
| VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl); | |||
| j += gvl * 2; | |||
| iy += inc_yv; | |||
| iy += inc_yv ; | |||
| } | |||
| // tail | |||
| if (j / 2 < m) | |||
| @@ -0,0 +1,106 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| /* | |||
|  * Precision-dependent aliases for the vector intrinsics used by the kernel below. | |||
|  * Without DOUBLE the e32/f32 m4 names are selected, otherwise the e64/f64 m4 names. | |||
|  * RISCV_RVV is defined outside this file view (presumably it maps each name to the | |||
|  * toolchain's intrinsic spelling - confirm). VSEV_FLOAT was previously defined twice | |||
|  * with identical text in each branch; the redundant copy has been dropped. | |||
|  */ | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) | |||
| #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) | |||
| #define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m4) | |||
| #define VLSEG2_FLOAT RISCV_RVV(vlseg2e32_v_f32m4x2) | |||
| #define VSSEG2_FLOAT RISCV_RVV(vsseg2e32_v_f32m4x2) | |||
| #define FLOAT_VX2_T vfloat32m4x2_t | |||
| #define VGET_VX2 RISCV_RVV(vget_v_f32m4x2_f32m4) | |||
| #define VSET_VX2 RISCV_RVV(vset_v_f32m4_f32m4x2) | |||
| #else | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) | |||
| #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) | |||
| #define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m4) | |||
| #define VLSEG2_FLOAT RISCV_RVV(vlseg2e64_v_f64m4x2) | |||
| #define VSSEG2_FLOAT RISCV_RVV(vsseg2e64_v_f64m4x2) | |||
| #define FLOAT_VX2_T vfloat64m4x2_t | |||
| #define VGET_VX2 RISCV_RVV(vget_v_f64m4x2_f64m4) | |||
| #define VSET_VX2 RISCV_RVV(vset_v_f64m4_f64m4x2) | |||
| #endif | |||
| /* | |||
|  * Complex out-of-place scaled copy without transposition or conjugation: | |||
|  * every complex element of a is multiplied by (alpha_r + i*alpha_i) and written | |||
|  * to the same (row, column) position of b. | |||
|  * | |||
|  * rows, cols        extent in complex elements; nothing is done if either is <= 0 | |||
|  * alpha_r, alpha_i  real and imaginary parts of the scale factor | |||
|  * a, lda            source and its column distance in complex elements | |||
|  * b, ldb            destination and its column distance in complex elements | |||
|  * | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) | |||
| { | |||
|     /* Column distances converted from complex elements to FLOATs. */ | |||
|     const BLASLONG src_step = lda * 2; | |||
|     const BLASLONG dst_step = ldb * 2; | |||
|     FLOAT *src_col = a; | |||
|     FLOAT *dst_col = b; | |||
|     BLASLONG col, row, off; | |||
|     FLOAT_V_T in_re, in_im, out_re, out_im; | |||
|     FLOAT_VX2_T packed_in, packed_out; | |||
|     unsigned int gvl = 0; | |||
|     if (rows <= 0 || cols <= 0) | |||
|         return (0); | |||
|     for (col = 0; col < cols; col++) | |||
|     { | |||
|         /* off is the FLOAT offset inside the current column (two per complex element). */ | |||
|         off = 0; | |||
|         for (row = 0; row < rows; row += gvl) | |||
|         { | |||
|             gvl = VSETVL(rows - row); | |||
|             /* Segment load splits the interleaved pairs into real and imaginary vectors. */ | |||
|             packed_in = VLSEG2_FLOAT(&src_col[off], gvl); | |||
|             in_re = VGET_VX2(packed_in, 0); | |||
|             in_im = VGET_VX2(packed_in, 1); | |||
|             /* real = alpha_r * re - alpha_i * im */ | |||
|             out_re = VFMUL_VF_FLOAT(in_re, alpha_r, gvl); | |||
|             out_re = VFNMSACVF_FLOAT(out_re, alpha_i, in_im, gvl); | |||
|             /* imag = alpha_r * im + alpha_i * re */ | |||
|             out_im = VFMUL_VF_FLOAT(in_im, alpha_r, gvl); | |||
|             out_im = VFMACCVF_FLOAT(out_im, alpha_i, in_re, gvl); | |||
|             packed_out = VSET_VX2(packed_out, 0, out_re); | |||
|             packed_out = VSET_VX2(packed_out, 1, out_im); | |||
|             VSSEG2_FLOAT(dst_col + off, packed_out, gvl); | |||
|             off += gvl * 2; | |||
|         } | |||
|         src_col += src_step; | |||
|         dst_col += dst_step; | |||
|     } | |||
|     return (0); | |||
| } | |||
| @@ -27,65 +27,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /************************************************************************************** | |||
| * 2013/09/14 Saar | |||
| * BLASTEST float : OK | |||
| * BLASTEST double : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * BLASTEST float : OK | |||
| * BLASTEST double : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| // The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces. | |||
| // In certain cases, the expected return values for cblas_c/zscal differ from those of other upper-level interfaces. | |||
| // To handle this, we use the dummy2 parameter to differentiate between them. | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG inc_x2; | |||
| BLASLONG ip = 0; | |||
| FLOAT temp; | |||
| BLASLONG i = 0; | |||
| BLASLONG inc_x2; | |||
| BLASLONG ip = 0; | |||
| FLOAT temp; | |||
| if ( (n <= 0) || (inc_x <= 0)) | |||
| return(0); | |||
| if ((n <= 0) || (inc_x <= 0)) | |||
| return(0); | |||
| inc_x2 = 2 * inc_x; | |||
| if (dummy2 == 0) { | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| if (da_r == 0.0 && da_i == 0.0) | |||
| { | |||
| x[ip] = 0.0; | |||
| x[ip+1] = 0.0; | |||
| } | |||
| else | |||
| { | |||
| temp = da_r * x[ip] - da_i * x[ip+1]; | |||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||
| x[ip] = temp; | |||
| } | |||
| inc_x2 = 2 * inc_x; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| if ( da_r == 0.0 ) | |||
| { | |||
| if ( da_i == 0.0 ) | |||
| { | |||
| temp = 0.0; | |||
| x[ip+1] = 0.0 ; | |||
| } | |||
| else | |||
| { | |||
| temp = - da_i * x[ip+1] ; | |||
| if (isnan(x[ip]) || isinf(x[ip])) temp = NAN; | |||
| if (!isinf(x[ip+1])) | |||
| x[ip+1] = da_i * x[ip] ; | |||
| else x[ip+1] = NAN; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( da_i == 0.0 ) | |||
| { | |||
| temp = da_r * x[ip] ; | |||
| x[ip+1] = da_r * x[ip+1]; | |||
| } | |||
| else | |||
| { | |||
| temp = da_r * x[ip] - da_i * x[ip+1] ; | |||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||
| } | |||
| } | |||
| x[ip] = temp; | |||
| ip += inc_x2; | |||
| } | |||
| return(0); | |||
| } | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| temp = da_r * x[ip] - da_i * x[ip+1]; | |||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||
| ip += inc_x2; | |||
| } | |||
| return(0); | |||
| x[ip] = temp; | |||
| ip += inc_x2; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -70,6 +70,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| FLOAT_VX2_T vx2; | |||
| if(inc_x == 1) { | |||
| if (dummy2 == 0 && da_r==0. && da_i == 0.) { | |||
| BLASLONG i; | |||
| for (i=0; i < n*2; i++) x[i]=0.; | |||
| return(0); | |||
| } else { | |||
| for (size_t vl; n > 0; n -= vl, x += vl*2) { | |||
| vl = VSETVL(n); | |||
| @@ -80,6 +85,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| vt = VFMULVF_FLOAT(vr, da_r, vl); | |||
| vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); | |||
| vi = VFMULVF_FLOAT(vi, da_r, vl); | |||
| vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); | |||
| @@ -87,9 +93,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| vx2 = VSET_VX2(vx2, 1, vi); | |||
| VSSEG_FLOAT(x, vx2, vl); | |||
| } | |||
| } | |||
| } else { | |||
| if (dummy2 == 0 && da_r==0. && da_i == 0.) { | |||
| BLASLONG i,ix=0,inc_x2=2*inc_x; | |||
| for (i=0; i < n; i++) {x[ix]=0.;x[ix+1]=0.;ix+=inc_x2;}; | |||
| return(0); | |||
| } else { | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { | |||
| vl = VSETVL(n); | |||
| @@ -105,6 +116,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| vx2 = VSET_VX2(vx2, 0, vt); | |||
| vx2 = VSET_VX2(vx2, 1, vi); | |||
| VSSSEG_FLOAT(x, stride_x, vx2, vl); | |||
| } | |||
| } | |||
| } | |||
| @@ -57,9 +57,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| if((n <= 0) || (inc_x <= 0)) | |||
| return(0); | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T vt, v0, v1; | |||
| { | |||
| if (dummy2 == 0 && da_r == 0. && da_i == 0.) { | |||
| int i,inc_x2,ix; | |||
| inc_x2 = 2*inc_x; | |||
| ix=0; | |||
| for (i=0;i<n;i++){x[ix]=0.;x[ix+1]=0.;ix+=inc_x2;} | |||
| } else { | |||
| unsigned int gvl = 0; | |||
| FLOAT_V_T vt, v0, v1; | |||
| { | |||
| gvl = VSETVL(n); | |||
| BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); | |||
| BLASLONG inc_xv = inc_x * 2 * gvl; | |||
| @@ -91,6 +97,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F | |||
| VSSEV_FLOAT(&x[ix], stride_x, vt, gvl); | |||
| VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl); | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -86,3 +86,8 @@ endif | |||
| ifndef QROTMKERNEL | |||
| QROTMKERNEL = ../generic/rotm.c | |||
| endif | |||
| SSCALKERNEL = ../arm/scal.c | |||
| DSCALKERNEL = ../arm/scal.c | |||
| CSCALKERNEL = ../arm/zscal.c | |||
| ZSCALKERNEL = ../arm/zscal.c | |||
| @@ -200,3 +200,6 @@ endif | |||
| ifndef QROTMKERNEL | |||
| QROTMKERNEL = ../generic/rotm.c | |||
| endif | |||
| CSCALKERNEL = ../arm/zscal.c | |||
| ZSCALKERNEL = ../arm/zscal.c | |||
| @@ -323,11 +323,11 @@ DSCALKERNEL = scal_sse2.S | |||
| endif | |||
| ifndef CSCALKERNEL | |||
| CSCALKERNEL = zscal_sse.S | |||
| CSCALKERNEL = ../arm/zscal.c | |||
| endif | |||
| ifndef ZSCALKERNEL | |||
| ZSCALKERNEL = zscal_sse2.S | |||
| ZSCALKERNEL = ../arm/zscal.c | |||
| endif | |||
| ifndef ASCALKERNEL | |||
| @@ -229,10 +229,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| if ( da_i == 0.0 ) | |||
| { | |||
| if (!dummy2) { | |||
| while(j < n1) | |||
| { | |||
| x[i]=0.0; | |||
| x[i+1]=0.0; | |||
| x[i+inc_x]=0.0; | |||
| @@ -244,21 +243,48 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| while(j < n) | |||
| { | |||
| x[i]=0.0; | |||
| x[i+1]=0.0; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } else { | |||
| float temp; | |||
| while(j < n1) | |||
| { | |||
| if (isnan(x[i])|| isnan(x[i+1])) | |||
| temp=NAN; | |||
| else | |||
| temp=0.0; | |||
| x[i]=temp; | |||
| x[i+1]=temp; | |||
| if (isnan(x[i+inc_x])|| isnan(x[i+inc_x+1])) | |||
| temp=NAN; | |||
| else | |||
| temp=0.0; | |||
| x[i+inc_x]= temp; | |||
| x[i+inc_x+1]= temp; | |||
| i += 2*inc_x; | |||
| j+=2; | |||
| } | |||
| while(j < n) | |||
| { | |||
| if (isnan(x[i])|| isnan(x[i+1])) | |||
| temp=NAN; | |||
| else | |||
| temp=0.0; | |||
| x[i]=temp; | |||
| x[i+1]=temp; | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| while(j < n1) | |||
| { | |||
| if (isnan(x[i]) || isinf(x[i])) | |||
| temp0 = NAN; | |||
| else | |||
| @@ -278,7 +304,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| x[i+inc_x] = temp1; | |||
| i += 2*inc_x ; | |||
| j+=2; | |||
| } | |||
| while(j < n) | |||
| @@ -305,14 +330,12 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| else | |||
| { | |||
| if ( da_i == 0.0 ) | |||
| if ( da_i == 0.0 && dummy2 ) | |||
| { | |||
| BLASLONG n1 = n & -2; | |||
| while(j < n1) | |||
| { | |||
| temp0 = da_r * x[i]; | |||
| x[i+1] = da_r * x[i+1]; | |||
| x[i] = temp0; | |||
| @@ -367,22 +390,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| return(0); | |||
| } | |||
| BLASLONG n1 = n & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| alpha[0] = da_r; | |||
| alpha[1] = da_i; | |||
| if ( da_r == 0.0 ) | |||
| if ( da_i == 0 ) | |||
| if ( da_i == 0 && !dummy2) | |||
| cscal_kernel_16_zero(n1 , alpha , x); | |||
| else | |||
| cscal_kernel_16_zero_r(n1 , alpha , x); | |||
| cscal_kernel_16/*_zero_r*/(n1 , alpha , x); | |||
| else | |||
| cscal_kernel_16(n1 , alpha , x); | |||
| i = n1 << 1; | |||
| j = n1; | |||
| } | |||
| @@ -393,6 +413,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| { | |||
| FLOAT res=0.0; | |||
| if (isnan(da_r)) res= da_r; | |||
| if (dummy2) | |||
| if (isnan(x[i])||isnan(x[i+1])) res= NAN; | |||
| while(j < n) | |||
| { | |||
| x[i]=res; | |||
| @@ -415,7 +437,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| } else | |||
| { | |||
| while(j < n) | |||
| { | |||
| temp0 = -da_i * x[i+1]; | |||
| @@ -424,11 +445,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| if (!isinf(x[i+1])) | |||
| x[i+1] = da_i * x[i]; | |||
| else x[i+1] = NAN; | |||
| if ( x[i] == x[i]) //preserve NaN | |||
| if ( !isnan(x[i])) //preserve NaN | |||
| x[i] = temp0; | |||
| i += 2 ; | |||
| j++; | |||
| } | |||
| } | |||
| @@ -439,12 +459,22 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| if ( da_i == 0.0 ) | |||
| { | |||
| while(j < n) | |||
| { | |||
| temp0 = da_r * x[i]; | |||
| x[i+1] = da_r * x[i+1]; | |||
| if (dummy2) { | |||
| if (isnan(x[i])||isinf(x[i])) temp0=NAN; | |||
| if (isnan(x[i+1])||isinf(x[i+1])) | |||
| x[i+1]=NAN; | |||
| else | |||
| x[i+1] = da_r * x[i+1]; | |||
| } else { | |||
| if (isnan(x[i])) | |||
| x[i+1] = NAN; | |||
| else | |||
| x[i+1] = da_r * x[i+1]; | |||
| } | |||
| x[i] = temp0; | |||
| i += 2 ; | |||
| j++; | |||
| @@ -476,7 +506,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| temp0 = da_r * x[i] - da_i * x[i+1]; | |||
| x[i+1] = da_r * x[i+1] + da_i * x[i]; | |||
| x[i] = temp0; | |||
| if(!isnan(x[i]))x[i] = temp0; | |||
| i += 2 ; | |||
| j++; | |||
| @@ -222,13 +222,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| if ( da_r == 0.0 ) | |||
| { | |||
| BLASLONG n1 = n & -2; | |||
| if ( da_i == 0.0 ) | |||
| { | |||
| if (!dummy2) { | |||
| while(j < n1) | |||
| { | |||
| x[i]=0.0; | |||
| x[i+1]=0.0; | |||
| x[i+inc_x]=0.0; | |||
| @@ -245,9 +246,40 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| x[i+1]=0.0; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } else { | |||
| float temp; | |||
| while(j < n1) | |||
| { | |||
| if (isnan(x[i])|| isnan(x[i+1])) | |||
| temp=NAN; | |||
| else | |||
| temp=0.0; | |||
| x[i]=temp; | |||
| x[i+1]=temp; | |||
| if (isnan(x[i+inc_x])|| isnan(x[i+inc_x+1])) | |||
| temp=NAN; | |||
| else | |||
| temp=0.0; | |||
| x[i+inc_x]= temp; | |||
| x[i+inc_x+1]= temp; | |||
| i += 2*inc_x; | |||
| j+=2; | |||
| } | |||
| while(j < n) | |||
| { | |||
| if (isnan(x[i])|| isnan(x[i+1])) | |||
| temp=NAN; | |||
| else | |||
| temp=0.0; | |||
| x[i]=temp; | |||
| x[i+1]=temp; | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| @@ -260,7 +292,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| temp0 = -da_i * x[i+1]; | |||
| if (!isinf(x[i+1])) | |||
| x[i+1] = da_i * x[i]; | |||
| else x[i+1] = NAN; | |||
| else x[i+1] = NAN; | |||
| x[i] = temp0; | |||
| if (isnan(x[i+inc_x]) || isinf(x[i+inc_x])) | |||
| temp1 = NAN; | |||
| @@ -291,16 +323,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( da_i == 0.0 ) | |||
| if ( da_i == 0.0 && dummy2) | |||
| { | |||
| BLASLONG n1 = n & -2; | |||
| @@ -370,26 +399,27 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| alpha[1] = da_i; | |||
| if ( da_r == 0.0 ) | |||
| if ( da_i == 0 ) | |||
| if ( da_i == 0 && !dummy2 ) | |||
| zscal_kernel_8_zero(n1 , alpha , x); | |||
| else | |||
| // zscal_kernel_8_zero_r(n1 , alpha , x); | |||
| zscal_kernel_8(n1 , alpha , x); | |||
| else | |||
| if ( da_i == 0 && da_r == da_r) | |||
| /* if ( da_i == 0 && da_r == da_r ) | |||
| zscal_kernel_8_zero_i(n1 , alpha , x); | |||
| else | |||
| else*/ | |||
| zscal_kernel_8(n1 , alpha , x); | |||
| } | |||
| i = n1 << 1; | |||
| j = n1; | |||
| if ( da_r == 0.0 || da_r != da_r ) | |||
| } | |||
| if ( da_r == 0.0 || isnan(da_r) ) | |||
| { | |||
| if ( da_i == 0.0 ) | |||
| { | |||
| FLOAT res=0.0; | |||
| if (da_r != da_r) res= da_r; | |||
| FLOAT res=0.0; | |||
| if (isnan(da_r)) res= da_r; | |||
| if (dummy2) | |||
| if (isnan(x[i])||isnan(x[i+1])) res= NAN; | |||
| while(j < n) | |||
| { | |||
| x[i]=res; | |||
| @@ -412,7 +442,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| } else | |||
| { | |||
| while(j < n) | |||
| { | |||
| temp0 = -da_i * x[i+1]; | |||
| @@ -421,7 +450,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| if (!isinf(x[i+1])) | |||
| x[i+1] = da_i * x[i]; | |||
| else x[i+1] = NAN; | |||
| if ( x[i] == x[i]) //preserve NaN | |||
| if ( !isnan(x[i])) //preserve NaN | |||
| x[i] = temp0; | |||
| i += 2 ; | |||
| j++; | |||
| @@ -437,8 +466,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| { | |||
| while(j < n) | |||
| { | |||
| temp0 = da_r * x[i]; | |||
| if (isnan(x[i]))x[i+1]=NAN; | |||
| else | |||
| x[i+1] = da_r * x[i+1]; | |||
| x[i] = temp0; | |||
| i += 2 ; | |||
| @@ -453,7 +483,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| { | |||
| temp0 = da_r * x[i] - da_i * x[i+1]; | |||
| x[i+1] = da_r * x[i+1] + da_i * x[i]; | |||
| x[i] = temp0; | |||
| if(!isnan(x[i]))x[i] = temp0; | |||
| i += 2 ; | |||
| j++; | |||
| @@ -210,7 +210,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| BLASLONG n1 = n & -2; | |||
| if (da_i == 0.0) { | |||
| if (dummy2 == 0) { | |||
| while (j < n1) { | |||
| x[i] = 0.0; | |||
| @@ -230,11 +230,43 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| j++; | |||
| } | |||
| } else { | |||
| while (j < n1) { | |||
| if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) { | |||
| x[i] = NAN; | |||
| x[i+1] = NAN; | |||
| }else{ | |||
| x[i] = 0.0; | |||
| x[i + 1] = 0.0; | |||
| } | |||
| if (isnan(x[i+inc_x]) || isinf(x[i+inc_x]) || isnan(x[i+1+inc_x])) { | |||
| x[i + inc_x] = NAN; | |||
| x[i + 1 + inc_x] = NAN; | |||
| } else { | |||
| x[i + inc_x] = 0.0; | |||
| x[i + 1 + inc_x] = 0.0; | |||
| } | |||
| i += 2 * inc_x; | |||
| j += 2; | |||
| } | |||
| while (j < n) { | |||
| if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) { | |||
| x[i] = NAN; | |||
| x[i+1] = NAN; | |||
| }else{ | |||
| x[i] = 0.0; | |||
| x[i + 1] = 0.0; | |||
| } | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| } | |||
| } else { | |||
| while (j < n1) { | |||
| if (isnan(x[i]) || isinf(x[i])) | |||
| if (isnan(x[i]) || isinf(x[i])) | |||
| temp0 = NAN; | |||
| else | |||
| temp0 = -da_i * x[i + 1]; | |||
| @@ -276,7 +308,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| } else { | |||
| if (da_i == 0.0) { | |||
| if (da_i == 0.0 && dummy2) { | |||
| BLASLONG n1 = n & -2; | |||
| while (j < n1) { | |||
| @@ -335,12 +367,16 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| alpha[1] = da_i; | |||
| if (da_r == 0.0) | |||
| if (da_i == 0) | |||
| if (da_i == 0 && dummy2 == 0) | |||
| cscal_kernel_16_zero(n1, x); | |||
| else | |||
| else { | |||
| /* if (dummy2 == 0) | |||
| cscal_kernel_16_zero_r(n1, alpha, x); | |||
| else if (da_i == 0) | |||
| cscal_kernel_16_zero_i(n1, alpha, x); | |||
| else*/ | |||
| cscal_kernel_16(n1, da_r, da_i, x); | |||
| } | |||
| /* else if (da_i == 0 && !isnan(da_r)) | |||
| cscal_kernel_16/*_zero_i(n1, alpha, x);*/ | |||
| else | |||
| cscal_kernel_16(n1, da_r, da_i, x); | |||
| @@ -354,7 +390,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| float res = 0.0; | |||
| if (isnan(da_r)) res = da_r; | |||
| while (j < n) { | |||
| if (dummy2) | |||
| if (isnan(x[i])|| isnan(x[i+1])) res=NAN; | |||
| x[i] = res; | |||
| x[i + 1] = res; | |||
| i += 2; | |||
| @@ -382,7 +419,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| x[i + 1] = da_i * x[i]; | |||
| else | |||
| x[i + 1] = NAN; | |||
| if (x[i] == x[i]) | |||
| if (!isnan(x[i])) | |||
| x[i] = temp0; | |||
| i += 2; | |||
| j++; | |||
| @@ -398,7 +435,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| while (j < n) { | |||
| temp0 = da_r * x[i]; | |||
| x[i + 1] = da_r * x[i + 1]; | |||
| if (dummy2) { | |||
| if (isnan(x[i])||isinf(x[i]))temp0 = NAN; | |||
| if (isnan(x[i+1])||isinf(x[i+1])) | |||
| x[i+1] = NAN; | |||
| else | |||
| x[i+1] = da_r * x[i + 1]; | |||
| } else { | |||
| if (isnan(x[i])) | |||
| x[i + 1] = NAN; | |||
| else | |||
| x[i + 1] = da_r * x[i + 1]; | |||
| } | |||
| x[i] = temp0; | |||
| i += 2; | |||
| j++; | |||
| @@ -411,7 +459,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| temp0 = da_r * x[i] - da_i * x[i + 1]; | |||
| x[i + 1] = da_r * x[i + 1] + da_i * x[i]; | |||
| x[i] = temp0; | |||
| if (!isnan(x[i])) x[i] = temp0; | |||
| i += 2; | |||
| j++; | |||
| @@ -208,7 +208,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| BLASLONG n1 = n & -2; | |||
| if (da_i == 0.0) { | |||
| if (dummy2 == 0) { | |||
| while (j < n1) { | |||
| x[i] = 0.0; | |||
| @@ -228,7 +228,38 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| j++; | |||
| } | |||
| } else { | |||
| while (j < n1) { | |||
| if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) { | |||
| x[i] = NAN; | |||
| x[i+1] = NAN; | |||
| } else { | |||
| x[i] = 0.0; | |||
| x[i+1] = 0.0; | |||
| } | |||
| if (isnan(x[i+inc_x]) || isinf(x[i+inc_x]) || isnan(x[i+inc_x+1])) { | |||
| x[i + inc_x] = NAN; | |||
| x[i + inc_x + 1] = NAN; | |||
| } else { | |||
| x[i + inc_x] = 0.; | |||
| x[i + inc_x + 1] = 0.; | |||
| } | |||
| i += 2 * inc_x; | |||
| j += 2; | |||
| } | |||
| while (j < n) { | |||
| if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) { | |||
| x[i] = NAN; | |||
| x[i+1] = NAN; | |||
| } else { | |||
| x[i] = 0.; | |||
| x[i+1] = 0.; | |||
| } | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| } | |||
| } else { | |||
| while (j < n1) { | |||
| @@ -276,7 +307,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| } else { | |||
| if (da_i == 0.0) { | |||
| if (da_i == 0.0 && dummy2) { | |||
| BLASLONG n1 = n & -2; | |||
| while (j < n1) { | |||
| @@ -335,12 +366,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| alpha[1] = da_i; | |||
| if (da_r == 0.0) | |||
| if (da_i == 0) | |||
| if (da_i == 0 && dummy2 == 0) | |||
| zscal_kernel_8_zero(n1, x); | |||
| else | |||
| zscal_kernel_8(n1, da_r, da_i, x); | |||
| else if (da_i == 0 && da_r == da_r) | |||
| zscal_kernel_8_zero_i(n1, alpha, x); | |||
| else | |||
| zscal_kernel_8(n1, da_r, da_i, x); | |||
| @@ -354,7 +383,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| double res= 0.0; | |||
| if (isnan(da_r)) res = da_r; | |||
| while (j < n) { | |||
| if (dummy2) | |||
| if (isnan(x[i]) || isnan(x[i+1])) res = NAN; | |||
| x[i] = res; | |||
| x[i + 1] = res; | |||
| i += 2; | |||
| @@ -381,7 +411,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| x[i + 1] = da_i * x[i]; | |||
| else | |||
| x[i + 1] = NAN; | |||
| if (x[i]==x[i]) | |||
| if (!isnan(x[i])) | |||
| x[i] = temp0; | |||
| i += 2; | |||
| j++; | |||
| @@ -397,8 +427,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| while (j < n) { | |||
| temp0 = da_r * x[i]; | |||
| x[i + 1] = da_r * x[i + 1]; | |||
| x[i] = temp0; | |||
| if (dummy2) { | |||
| if (isnan(x[i]) || isinf(x[i])) temp0 = NAN; | |||
| if (isnan(x[i + 1]) || isinf(x[i + 1])) | |||
| x[i + 1] = NAN; | |||
| else | |||
| x[i + 1] = da_r * x[i + 1]; | |||
| } else { | |||
| if (isnan(x[i])) | |||
| x[i + 1] = NAN; | |||
| else | |||
| x[i + 1] = da_r * x[i + 1]; | |||
| } | |||
| x[i] = temp0; | |||
| i += 2; | |||
| j++; | |||
| @@ -410,7 +451,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| temp0 = da_r * x[i] - da_i * x[i + 1]; | |||
| x[i + 1] = da_r * x[i + 1] + da_i * x[i]; | |||
| x[i] = temp0; | |||
| if (!isnan(x[i])) x[i] = temp0; | |||
| i += 2; | |||
| j++; | |||
| @@ -128,3 +128,477 @@ CTEST(dgemv, 0_nan_inf_incy_2) | |||
| } | |||
| #endif | |||
| #ifdef BUILD_COMPLEX | |||
| CTEST(cgemv, 0_nan_inf) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint incX = 1; | |||
| blasint incY = 1; | |||
| float alpha[2] = {0.0, 0.0}; | |||
| float beta[2] = {0.0, 0.0}; | |||
| char trans = 'N'; | |||
| float A[17 * 17 * 4]; | |||
| float X[17 * 2]; | |||
| float Y[17 * 2]; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| for (i = 0; i < (2 * N - 2); i += 4) | |||
| { | |||
| Y[i] = NAN; | |||
| Y[i + 1] = NAN; | |||
| Y[i + 2] = INFINITY; | |||
| Y[i + 3] = INFINITY; | |||
| } | |||
| Y[2 * N - 1] = NAN; | |||
| Y[2 * N - 2] = NAN; | |||
| BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); | |||
| for (i = 0; i < 2 * N; i ++) | |||
| ASSERT_TRUE(Y[i] == 0.0); | |||
| } | |||
| CTEST(cgemv, 0_nan_inf_incy_2) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint incX = 1; | |||
| blasint incY = 2; | |||
| float alpha[2] = {0.0, 0.0}; | |||
| float beta[2] = {0.0, 0.0}; | |||
| char trans = 'N'; | |||
| float A[17 * 17 * 4]; | |||
| float X[17]; | |||
| float Y[17 * 2 * 2]; | |||
| float *ay = Y; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| memset(Y, 0, sizeof(Y)); | |||
| for (i = 0; i < (2 * N - 2); i += 4) | |||
| { | |||
| ay[0] = NAN; | |||
| ay[1] = NAN; | |||
| ay += 4; | |||
| ay[0] = INFINITY; | |||
| ay[1] = INFINITY; | |||
| ay += 4; | |||
| } | |||
| Y[4 * N - 4] = NAN; | |||
| Y[4 * N - 3] = NAN; | |||
| BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); | |||
| for (i = 0; i < 4 * N; i ++) | |||
| ASSERT_TRUE(Y[i] == 0.0); | |||
| } | |||
| CTEST(cgemv, 0_2_nan_1_inf_1) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint incX = 1; | |||
| blasint incY = 1; | |||
| float alpha[2] = {0.0, 0.0}; | |||
| float beta[2] = {0.0, 2.0}; | |||
| char trans = 'N'; | |||
| float A[17 * 17 * 4]; | |||
| float X[17 * 2]; | |||
| float Y[17 * 2]; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| for (i = 0; i < (2 * N - 2); i += 4) | |||
| { | |||
| Y[i] = NAN; | |||
| Y[i + 1] = 1.0; | |||
| Y[i + 2] = INFINITY; | |||
| Y[i + 3] = 1.0; | |||
| } | |||
| Y[2 * N - 2] = NAN; | |||
| Y[2 * N - 1] = 1.0; | |||
| BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); | |||
| for (i = 0; i < 2 * N; i += 2) { | |||
| if ((i >> 1) % 2){ | |||
| ASSERT_TRUE(isnan(Y[i])); | |||
| ASSERT_TRUE(isinf(Y[i + 1])); | |||
| } | |||
| else { | |||
| ASSERT_TRUE(isnan(Y[i])); | |||
| ASSERT_TRUE(isnan(Y[i + 1])); | |||
| } | |||
| } | |||
| } | |||
| CTEST(cgemv, 0_2_nan_1_inf_1_incy_2) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint incX = 1; | |||
| blasint incY = 2; | |||
| float alpha[2] = {0.0, 0.0}; | |||
| float beta[2] = {0.0, 2.0}; | |||
| char trans = 'N'; | |||
| float A[17 * 17 * 4]; | |||
| float X[17]; | |||
| float Y[17 * 2 * 2]; | |||
| float *ay = Y; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| memset(Y, 0, sizeof(Y)); | |||
| for (i = 0; i < (2 * N - 2); i += 4) | |||
| { | |||
| ay[0] = NAN; | |||
| ay[1] = 1.0; | |||
| ay += 4; | |||
| ay[0] = INFINITY; | |||
| ay[1] = 1.0; | |||
| ay += 4; | |||
| } | |||
| Y[4 * N - 4] = NAN; | |||
| Y[4 * N - 3] = 1.0; | |||
| BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); | |||
| for (i = 0; i < 4 * N; i += 2) { | |||
| if ((i >> 1) % 2) { | |||
| ASSERT_TRUE(Y[i] == 0.0); | |||
| ASSERT_TRUE(Y[i + 1] == 0.0); | |||
| } | |||
| else { | |||
| if ((i >> 2) % 2) { | |||
| ASSERT_TRUE(isnan(Y[i])); | |||
| ASSERT_TRUE(isinf(Y[i + 1])); | |||
| } | |||
| else { | |||
| ASSERT_TRUE(isnan(Y[i])); | |||
| ASSERT_TRUE(isnan(Y[i + 1])); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| CTEST(cgemv, 2_0_nan_1_inf_1) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint incX = 1; | |||
| blasint incY = 1; | |||
| float alpha[2] = {0.0, 0.0}; | |||
| float beta[2] = {2.0, 0.0}; | |||
| char trans = 'N'; | |||
| float A[17 * 17 * 4]; | |||
| float X[17 * 2]; | |||
| float Y[17 * 2]; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| for (i = 0; i < (2 * N - 2); i += 4) | |||
| { | |||
| Y[i] = NAN; | |||
| Y[i + 1] = 1.0; | |||
| Y[i + 2] = INFINITY; | |||
| Y[i + 3] = 1.0; | |||
| } | |||
| Y[2 * N - 2] = NAN; | |||
| Y[2 * N - 1] = 1.0; | |||
| BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); | |||
| for (i = 0; i < 2 * N; i += 2) { | |||
| if ((i >> 1) % 2){ | |||
| ASSERT_TRUE(isinf(Y[i])); | |||
| ASSERT_TRUE(isnan(Y[i + 1])); | |||
| } | |||
| else { | |||
| ASSERT_TRUE(isnan(Y[i])); | |||
| ASSERT_TRUE(isnan(Y[i + 1])); | |||
| } | |||
| } | |||
| } | |||
| CTEST(cgemv, 2_0_nan_1_inf_1_incy_2) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint incX = 1; | |||
| blasint incY = 2; | |||
| float alpha[2] = {0.0, 0.0}; | |||
| float beta[2] = {2.0, 0.0}; | |||
| char trans = 'N'; | |||
| float A[17 * 17 * 4]; | |||
| float X[17]; | |||
| float Y[17 * 2 * 2]; | |||
| float *ay = Y; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| memset(Y, 0, sizeof(Y)); | |||
| for (i = 0; i < (2 * N - 2); i += 4) | |||
| { | |||
| ay[0] = NAN; | |||
| ay[1] = 1.0; | |||
| ay += 4; | |||
| ay[0] = INFINITY; | |||
| ay[1] = 1.0; | |||
| ay += 4; | |||
| } | |||
| Y[4 * N - 4] = NAN; | |||
| Y[4 * N - 3] = 1.0; | |||
| BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); | |||
| for (i = 0; i < 4 * N; i += 2) { | |||
| if ((i >> 1) % 2) { | |||
| ASSERT_TRUE(Y[i] == 0.0); | |||
| ASSERT_TRUE(Y[i + 1] == 0.0); | |||
| } | |||
| else { | |||
| if ((i >> 2) % 2) { | |||
| ASSERT_TRUE(isinf(Y[i])); | |||
| ASSERT_TRUE(isnan(Y[i + 1])); | |||
| } | |||
| else { | |||
| ASSERT_TRUE(isnan(Y[i])); | |||
| ASSERT_TRUE(isnan(Y[i + 1])); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| #ifdef BUILD_COMPLEX16 | |||
| CTEST(zgemv, 0_nan_inf) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint incX = 1; | |||
| blasint incY = 1; | |||
| double alpha[2] = {0.0, 0.0}; | |||
| double beta[2] = {0.0, 0.0}; | |||
| char trans = 'N'; | |||
| double A[17 * 17 * 4]; | |||
| double X[17 * 2]; | |||
| double Y[17 * 2]; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| for (i = 0; i < (2 * N - 2); i += 4) | |||
| { | |||
| Y[i] = NAN; | |||
| Y[i + 1] = NAN; | |||
| Y[i + 2] = INFINITY; | |||
| Y[i + 3] = INFINITY; | |||
| } | |||
| Y[2 * N - 1] = NAN; | |||
| Y[2 * N - 2] = NAN; | |||
| BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); | |||
| for (i = 0; i < 2 * N; i ++) | |||
| ASSERT_TRUE(Y[i] == 0.0); | |||
| } | |||
| CTEST(zgemv, 0_nan_inf_incy_2) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint incX = 1; | |||
| blasint incY = 2; | |||
| double alpha[2] = {0.0, 0.0}; | |||
| double beta[2] = {0.0, 0.0}; | |||
| char trans = 'N'; | |||
| double A[17 * 17 * 4]; | |||
| double X[17]; | |||
| double Y[17 * 2 * 2]; | |||
| double *ay = Y; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| memset(Y, 0, sizeof(Y)); | |||
| for (i = 0; i < (2 * N - 2); i += 4) | |||
| { | |||
| ay[0] = NAN; | |||
| ay[1] = NAN; | |||
| ay += 4; | |||
| ay[0] = INFINITY; | |||
| ay[1] = INFINITY; | |||
| ay += 4; | |||
| } | |||
| Y[4 * N - 4] = NAN; | |||
| Y[4 * N - 3] = NAN; | |||
| BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); | |||
| for (i = 0; i < 4 * N; i ++) | |||
| ASSERT_TRUE(Y[i] == 0.0); | |||
| } | |||
| CTEST(zgemv, 0_2_nan_1_inf_1) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint incX = 1; | |||
| blasint incY = 1; | |||
| double alpha[2] = {0.0, 0.0}; | |||
| double beta[2] = {0.0, 2.0}; | |||
| char trans = 'N'; | |||
| double A[17 * 17 * 4]; | |||
| double X[17 * 2]; | |||
| double Y[17 * 2]; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| for (i = 0; i < (2 * N - 2); i += 4) | |||
| { | |||
| Y[i] = NAN; | |||
| Y[i + 1] = 1.0; | |||
| Y[i + 2] = INFINITY; | |||
| Y[i + 3] = 1.0; | |||
| } | |||
| Y[2 * N - 2] = NAN; | |||
| Y[2 * N - 1] = 1.0; | |||
| BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); | |||
| for (i = 0; i < 2 * N; i += 2) { | |||
| if ((i >> 1) % 2){ | |||
| ASSERT_TRUE(isnan(Y[i])); | |||
| ASSERT_TRUE(isinf(Y[i + 1])); | |||
| } | |||
| else { | |||
| ASSERT_TRUE(isnan(Y[i])); | |||
| ASSERT_TRUE(isnan(Y[i + 1])); | |||
| } | |||
| } | |||
| } | |||
| CTEST(zgemv, 0_2_nan_1_inf_1_incy_2) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint incX = 1; | |||
| blasint incY = 2; | |||
| double alpha[2] = {0.0, 0.0}; | |||
| double beta[2] = {0.0, 2.0}; | |||
| char trans = 'N'; | |||
| double A[17 * 17 * 4]; | |||
| double X[17]; | |||
| double Y[17 * 2 * 2]; | |||
| double *ay = Y; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| memset(Y, 0, sizeof(Y)); | |||
| for (i = 0; i < (2 * N - 2); i += 4) | |||
| { | |||
| ay[0] = NAN; | |||
| ay[1] = 1.0; | |||
| ay += 4; | |||
| ay[0] = INFINITY; | |||
| ay[1] = 1.0; | |||
| ay += 4; | |||
| } | |||
| Y[4 * N - 4] = NAN; | |||
| Y[4 * N - 3] = 1.0; | |||
| BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); | |||
| for (i = 0; i < 4 * N; i += 2) { | |||
| if ((i >> 1) % 2) { | |||
| ASSERT_TRUE(Y[i] == 0.0); | |||
| ASSERT_TRUE(Y[i + 1] == 0.0); | |||
| } | |||
| else { | |||
| if ((i >> 2) % 2) { | |||
| ASSERT_TRUE(isnan(Y[i])); | |||
| ASSERT_TRUE(isinf(Y[i + 1])); | |||
| } | |||
| else { | |||
| ASSERT_TRUE(isnan(Y[i])); | |||
| ASSERT_TRUE(isnan(Y[i + 1])); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| CTEST(zgemv, 2_0_nan_1_inf_1) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint incX = 1; | |||
| blasint incY = 1; | |||
| double alpha[2] = {0.0, 0.0}; | |||
| double beta[2] = {2.0, 0.0}; | |||
| char trans = 'N'; | |||
| double A[17 * 17 * 4]; | |||
| double X[17 * 2]; | |||
| double Y[17 * 2]; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| for (i = 0; i < (2 * N - 2); i += 4) | |||
| { | |||
| Y[i] = NAN; | |||
| Y[i + 1] = 1.0; | |||
| Y[i + 2] = INFINITY; | |||
| Y[i + 3] = 1.0; | |||
| } | |||
| Y[2 * N - 2] = NAN; | |||
| Y[2 * N - 1] = 1.0; | |||
| BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); | |||
| for (i = 0; i < 2 * N; i += 2) { | |||
| if ((i >> 1) % 2){ | |||
| ASSERT_TRUE(isinf(Y[i])); | |||
| ASSERT_TRUE(isnan(Y[i + 1])); | |||
| } | |||
| else { | |||
| ASSERT_TRUE(isnan(Y[i])); | |||
| ASSERT_TRUE(isnan(Y[i + 1])); | |||
| } | |||
| } | |||
| } | |||
| CTEST(zgemv, 2_0_nan_1_inf_1_incy_2) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint incX = 1; | |||
| blasint incY = 2; | |||
| double alpha[2] = {0.0, 0.0}; | |||
| double beta[2] = {2.0, 0.0}; | |||
| char trans = 'N'; | |||
| double A[17 * 17 * 4]; | |||
| double X[17]; | |||
| double Y[17 * 2 * 2]; | |||
| double *ay = Y; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| memset(Y, 0, sizeof(Y)); | |||
| for (i = 0; i < (2 * N - 2); i += 4) | |||
| { | |||
| ay[0] = NAN; | |||
| ay[1] = 1.0; | |||
| ay += 4; | |||
| ay[0] = INFINITY; | |||
| ay[1] = 1.0; | |||
| ay += 4; | |||
| } | |||
| Y[4 * N - 4] = NAN; | |||
| Y[4 * N - 3] = 1.0; | |||
| BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); | |||
| for (i = 0; i < 4 * N; i += 2) { | |||
| if ((i >> 1) % 2) { | |||
| ASSERT_TRUE(Y[i] == 0.0); | |||
| ASSERT_TRUE(Y[i + 1] == 0.0); | |||
| } | |||
| else { | |||
| if ((i >> 2) % 2) { | |||
| ASSERT_TRUE(isinf(Y[i])); | |||
| ASSERT_TRUE(isnan(Y[i + 1])); | |||
| } | |||
| else { | |||
| ASSERT_TRUE(isnan(Y[i])); | |||
| ASSERT_TRUE(isnan(Y[i + 1])); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| @@ -442,6 +442,33 @@ CTEST(cscal, i_0inf_inc_2) | |||
| ASSERT_TRUE(isnan(inf[17])); | |||
| } | |||
| CTEST(cscal, i00_NAN) | |||
| { | |||
| blasint N=9; | |||
| blasint incX=1; | |||
| float i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 }; | |||
| float nan[] = {NAN, 0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; | |||
| BLASFUNC(cscal)(&N, i, nan, &incX); | |||
| ASSERT_TRUE(isnan(nan[0])); | |||
| ASSERT_TRUE(isnan(nan[1])); | |||
| ASSERT_TRUE(isnan(nan[16])); | |||
| ASSERT_TRUE(isnan(nan[17])); | |||
| } | |||
| CTEST(cscal, i00_NAN_incx_2) | |||
| { | |||
| blasint N=9; | |||
| blasint incX=2; | |||
| float i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 }; | |||
| float nan[] = {0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, | |||
| 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN}; | |||
| BLASFUNC(cscal)(&N, i, nan, &incX); | |||
| ASSERT_TRUE(isnan(nan[0])); | |||
| ASSERT_TRUE(isnan(nan[1])); | |||
| ASSERT_TRUE(isnan(nan[16])); | |||
| ASSERT_TRUE(isnan(nan[17])); | |||
| } | |||
| #endif | |||
| #ifdef BUILD_COMPLEX16 | |||
| @@ -588,4 +615,31 @@ CTEST(zscal, i_0inf_inc_2) | |||
| ASSERT_TRUE(isnan(inf[17])); | |||
| } | |||
| CTEST(zscal, i00_NAN) | |||
| { | |||
| blasint N=9; | |||
| blasint incX=1; | |||
| double i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 }; | |||
| double nan[] = {NAN, 0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; | |||
| BLASFUNC(zscal)(&N, i, nan, &incX); | |||
| ASSERT_TRUE(isnan(nan[0])); | |||
| ASSERT_TRUE(isnan(nan[1])); | |||
| ASSERT_TRUE(isnan(nan[16])); | |||
| ASSERT_TRUE(isnan(nan[17])); | |||
| } | |||
| CTEST(zscal, i00_NAN_incx_2) | |||
| { | |||
| blasint N=9; | |||
| blasint incX=2; | |||
| double i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 }; | |||
| double nan[] = {0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, | |||
| 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN}; | |||
| BLASFUNC(zscal)(&N, i, nan, &incX); | |||
| ASSERT_TRUE(isnan(nan[0])); | |||
| ASSERT_TRUE(isnan(nan[1])); | |||
| ASSERT_TRUE(isnan(nan[16])); | |||
| ASSERT_TRUE(isnan(nan[17])); | |||
| } | |||
| #endif | |||