| @@ -231,7 +231,12 @@ In chronological order: | |||||
| * [2024-01-24] Optimize GEMV forwarding on ARM64 systems | * [2024-01-24] Optimize GEMV forwarding on ARM64 systems | ||||
| * Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32> | * Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32> | ||||
| * [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE | |||||
| * [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE | |||||
| * Annop Wongwathanarat <annop.wongwathanarat@arm.com> | * Annop Wongwathanarat <annop.wongwathanarat@arm.com> | ||||
| * [2025-01-21] Optimize gemv_t_sve_v1x3 kernel | |||||
| * [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1 | |||||
| * [2025-01-21] Optimize gemv_t_sve_v1x3 kernel | |||||
| * Marek Michalowski <https://github.com/michalowski-arm> | |||||
| * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` | |||||
| @@ -315,8 +315,8 @@ endif | |||||
| endif | endif | ||||
| ifeq ($(CPP_THREAD_SAFETY_TEST), 1) | ifeq ($(CPP_THREAD_SAFETY_TEST), 1) | ||||
| @install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||||
| @install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||||
| @install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||||
| @install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) | |||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -79,6 +79,9 @@ macro(SetDefaultL1) | |||||
| SetFallback(CROTKERNEL zrot.S) | SetFallback(CROTKERNEL zrot.S) | ||||
| SetFallback(ZROTKERNEL zrot.S) | SetFallback(ZROTKERNEL zrot.S) | ||||
| SetFallback(XROTKERNEL zrot.S) | SetFallback(XROTKERNEL zrot.S) | ||||
| SetFallback(SROTMKERNEL rotm.S) | |||||
| SetFallback(DROTMKERNEL rotm.S) | |||||
| SetFallback(QROTMKERNEL rotm.S) | |||||
| SetFallback(SSCALKERNEL scal.S) | SetFallback(SSCALKERNEL scal.S) | ||||
| SetFallback(DSCALKERNEL scal.S) | SetFallback(DSCALKERNEL scal.S) | ||||
| SetFallback(CSCALKERNEL zscal.S) | SetFallback(CSCALKERNEL zscal.S) | ||||
| @@ -21,7 +21,15 @@ endif() | |||||
| # Other files expect CORE, which is actually TARGET and will become TARGET_CORE for kernel build. Confused yet? | # Other files expect CORE, which is actually TARGET and will become TARGET_CORE for kernel build. Confused yet? | ||||
| # It seems we are meant to use TARGET as input and CORE internally as kernel. | # It seems we are meant to use TARGET as input and CORE internally as kernel. | ||||
| if(NOT DEFINED CORE AND DEFINED TARGET) | if(NOT DEFINED CORE AND DEFINED TARGET) | ||||
| set(CORE ${TARGET}) | |||||
| # Map the LOONGSON* TARGET names onto the LA* core names; every other | |||||
| # TARGET value is used as CORE unchanged. | |||||
| # TARGET is quoted so an empty value cannot break the if() expression. | |||||
| if ("${TARGET}" STREQUAL "LOONGSON3R5") | |||||
| set(CORE "LA464") | |||||
| elseif ("${TARGET}" STREQUAL "LOONGSON2K1000") | |||||
| set(CORE "LA264") | |||||
| elseif ("${TARGET}" STREQUAL "LOONGSONGENERIC") | |||||
| set(CORE "LA64_GENERIC") | |||||
| else () | |||||
| set(CORE ${TARGET}) | |||||
| endif() | |||||
| endif() | endif() | ||||
| # TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. | # TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. | ||||
| @@ -16,6 +16,14 @@ endfunction () | |||||
| macro(ParseMakefileVars MAKEFILE_IN) | macro(ParseMakefileVars MAKEFILE_IN) | ||||
| message(STATUS "Reading vars from ${MAKEFILE_IN}...") | message(STATUS "Reading vars from ${MAKEFILE_IN}...") | ||||
| set (C_COMPILER ${CMAKE_C_COMPILER_ID}) | set (C_COMPILER ${CMAKE_C_COMPILER_ID}) | ||||
| # Normalize CMake's identifiers: any compiler id containing "Clang" becomes | |||||
| # CLANG and the system name "Windows" becomes WINNT. | |||||
| # NOTE(review): presumably these match the names tested in the parsed makefiles - confirm. | |||||
| set (OSNAME ${CMAKE_SYSTEM_NAME}) | |||||
| # Quoted so that an empty compiler id or system name cannot break the if(). | |||||
| if ("${C_COMPILER}" MATCHES Clang) | |||||
| set (C_COMPILER CLANG) | |||||
| endif () | |||||
| if ("${OSNAME}" STREQUAL Windows) | |||||
| set (OSNAME WINNT) | |||||
| endif () | |||||
| # message() joins separate arguments without separators, so pass one string. | |||||
| message(STATUS "OS ${OSNAME} COMPILER ${C_COMPILER}") | |||||
| set (IfElse 0) | set (IfElse 0) | ||||
| set (ElseSeen 0) | set (ElseSeen 0) | ||||
| set (SkipIfs 0) | set (SkipIfs 0) | ||||
| @@ -22,6 +22,7 @@ | |||||
| #define DSUM_K dsum_k | #define DSUM_K dsum_k | ||||
| #define DSWAP_K dswap_k | #define DSWAP_K dswap_k | ||||
| #define DROT_K drot_k | #define DROT_K drot_k | ||||
| #define DROTM_K drotm_k | |||||
| #define DGEMV_N dgemv_n | #define DGEMV_N dgemv_n | ||||
| #define DGEMV_T dgemv_t | #define DGEMV_T dgemv_t | ||||
| @@ -180,6 +181,7 @@ | |||||
| #define DSUM_K gotoblas -> dsum_k | #define DSUM_K gotoblas -> dsum_k | ||||
| #define DSWAP_K gotoblas -> dswap_k | #define DSWAP_K gotoblas -> dswap_k | ||||
| #define DROT_K gotoblas -> drot_k | #define DROT_K gotoblas -> drot_k | ||||
| #define DROTM_K gotoblas -> drotm_k | |||||
| #define DGEMV_N gotoblas -> dgemv_n | #define DGEMV_N gotoblas -> dgemv_n | ||||
| #define DGEMV_T gotoblas -> dgemv_t | #define DGEMV_T gotoblas -> dgemv_t | ||||
| @@ -213,9 +213,9 @@ int srotmg_k(float *, float *, float *, float *, float *); | |||||
| int drotmg_k(double *, double *, double *, double *, double *); | int drotmg_k(double *, double *, double *, double *, double *); | ||||
| int qrotmg_k(xdouble *, xdouble *, xdouble *, xdouble *, xdouble *); | int qrotmg_k(xdouble *, xdouble *, xdouble *, xdouble *, xdouble *); | ||||
| int srotm_k (BLASLONG, float, BLASLONG, float, BLASLONG, float); | |||||
| int drotm_k (BLASLONG, double, BLASLONG, double, BLASLONG, double); | |||||
| int qrotm_k (BLASLONG, xdouble, BLASLONG, xdouble, BLASLONG, xdouble); | |||||
| int srotm_k (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||||
| int drotm_k (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||||
| int qrotm_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); | |||||
| int saxpby_k (BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); | int saxpby_k (BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); | ||||
| @@ -70,6 +70,7 @@ | |||||
| #define SUM_K QSUM_K | #define SUM_K QSUM_K | ||||
| #define SWAP_K QSWAP_K | #define SWAP_K QSWAP_K | ||||
| #define ROT_K QROT_K | #define ROT_K QROT_K | ||||
| #define ROTM_K QROTM_K | |||||
| #define GEMV_N QGEMV_N | #define GEMV_N QGEMV_N | ||||
| #define GEMV_T QGEMV_T | #define GEMV_T QGEMV_T | ||||
| @@ -361,6 +362,7 @@ | |||||
| #define SUM_K DSUM_K | #define SUM_K DSUM_K | ||||
| #define SWAP_K DSWAP_K | #define SWAP_K DSWAP_K | ||||
| #define ROT_K DROT_K | #define ROT_K DROT_K | ||||
| #define ROTM_K DROTM_K | |||||
| #define GEMV_N DGEMV_N | #define GEMV_N DGEMV_N | ||||
| #define GEMV_T DGEMV_T | #define GEMV_T DGEMV_T | ||||
| @@ -977,6 +979,7 @@ | |||||
| #define SUM_K SSUM_K | #define SUM_K SSUM_K | ||||
| #define SWAP_K SSWAP_K | #define SWAP_K SSWAP_K | ||||
| #define ROT_K SROT_K | #define ROT_K SROT_K | ||||
| #define ROTM_K SROTM_K | |||||
| #define GEMV_N SGEMV_N | #define GEMV_N SGEMV_N | ||||
| #define GEMV_T SGEMV_T | #define GEMV_T SGEMV_T | ||||
| @@ -197,6 +197,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||||
| //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
| int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | ||||
| int (*srotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||||
| #endif | #endif | ||||
| #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) | #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) | ||||
| int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
| @@ -330,6 +331,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||||
| #endif | #endif | ||||
| #if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) | #if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) | ||||
| int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | ||||
| int (*drotm_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||||
| int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
| int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
| int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
| @@ -439,6 +441,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); | |||||
| int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
| xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
| int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | ||||
| int (*qrotm_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); | |||||
| int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
| int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
| @@ -22,6 +22,7 @@ | |||||
| #define QSUM_K qsum_k | #define QSUM_K qsum_k | ||||
| #define QSWAP_K qswap_k | #define QSWAP_K qswap_k | ||||
| #define QROT_K qrot_k | #define QROT_K qrot_k | ||||
| #define QROTM_K qrotm_k | |||||
| #define QGEMV_N qgemv_n | #define QGEMV_N qgemv_n | ||||
| #define QGEMV_T qgemv_t | #define QGEMV_T qgemv_t | ||||
| @@ -165,6 +166,7 @@ | |||||
| #define QSUM_K gotoblas -> qsum_k | #define QSUM_K gotoblas -> qsum_k | ||||
| #define QSWAP_K gotoblas -> qswap_k | #define QSWAP_K gotoblas -> qswap_k | ||||
| #define QROT_K gotoblas -> qrot_k | #define QROT_K gotoblas -> qrot_k | ||||
| #define QROTM_K gotoblas -> qrotm_k | |||||
| #define QGEMV_N gotoblas -> qgemv_n | #define QGEMV_N gotoblas -> qgemv_n | ||||
| #define QGEMV_T gotoblas -> qgemv_t | #define QGEMV_T gotoblas -> qgemv_t | ||||
| @@ -24,6 +24,7 @@ | |||||
| #define SSCAL_K sscal_k | #define SSCAL_K sscal_k | ||||
| #define SSWAP_K sswap_k | #define SSWAP_K sswap_k | ||||
| #define SROT_K srot_k | #define SROT_K srot_k | ||||
| #define SROTM_K srotm_k | |||||
| #define SGEMV_N sgemv_n | #define SGEMV_N sgemv_n | ||||
| #define SGEMV_T sgemv_t | #define SGEMV_T sgemv_t | ||||
| @@ -189,6 +190,7 @@ | |||||
| #define SSCAL_K gotoblas -> sscal_k | #define SSCAL_K gotoblas -> sscal_k | ||||
| #define SSWAP_K gotoblas -> sswap_k | #define SSWAP_K gotoblas -> sswap_k | ||||
| #define SROT_K gotoblas -> srot_k | #define SROT_K gotoblas -> srot_k | ||||
| #define SROTM_K gotoblas -> srotm_k | |||||
| #define SGEMV_N gotoblas -> sgemv_n | #define SGEMV_N gotoblas -> sgemv_n | ||||
| #define SGEMV_T gotoblas -> sgemv_t | #define SGEMV_T gotoblas -> sgemv_t | ||||
| @@ -480,13 +480,13 @@ the LLVM toolchain enables native compilation of the Fortran sources of LAPACK a | |||||
| 4. Navigate to the OpenBLAS source code directory and start building OpenBLAS | 4. Navigate to the OpenBLAS source code directory and start building OpenBLAS | ||||
| by invoking Ninja: | by invoking Ninja: | ||||
| ```cmd | ```cmd | ||||
| cd OpenBLAS | cd OpenBLAS | ||||
| mkdir build | mkdir build | ||||
| cd build | cd build | ||||
| cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_C_COMPILER=arm64-pc-windows-msvc -DCMAKE_ASM_COMPILER=arm64-pc-windows-msvc -DCMAKE_Fortran_COMPILER=flang-new | |||||
| cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_C_COMPILER_TARGET=arm64-pc-windows-msvc -DCMAKE_ASM_COMPILER_TARGET=arm64-pc-windows-msvc -DCMAKE_Fortran_COMPILER=flang-new | |||||
| ninja -j16 | ninja -j16 | ||||
| ``` | ``` | ||||
| @@ -1,5 +1,5 @@ | |||||
| /*********************************************************************/ | /*********************************************************************/ | ||||
| /* Copyright 2024 The OpenBLAS Project */ | |||||
| /* Copyright 2024, 2025 The OpenBLAS Project */ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | /* Copyright 2009, 2010 The University of Texas at Austin. */ | ||||
| /* All rights reserved. */ | /* All rights reserved. */ | ||||
| /* */ | /* */ | ||||
| @@ -177,6 +177,49 @@ static int init_amxtile_permission() { | |||||
| } | } | ||||
| #endif | #endif | ||||
| #ifdef DYNAMIC_ARCH | |||||
| extern char* gotoblas_corename(void); | |||||
| #endif | |||||
| #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) | |||||
| static inline int get_gemm_optimal_nthreads_neoversev1(double MNK, int ncpu) { | |||||
| /* Thread-count profile: each MNK bracket caps the thread count at a fixed | |||||
| value (never above ncpu); below the first bracket one thread is used and | |||||
| above the last bracket all ncpu threads are used. */ | |||||
| if (MNK < 262144L) return 1; | |||||
| if (MNK < 1124864L) return MIN(ncpu, 6); | |||||
| if (MNK < 7880599L) return MIN(ncpu, 12); | |||||
| if (MNK < 17173512L) return MIN(ncpu, 16); | |||||
| if (MNK < 33386248L) return MIN(ncpu, 20); | |||||
| if (MNK < 57066625L) return MIN(ncpu, 24); | |||||
| if (MNK < 91733851L) return MIN(ncpu, 32); | |||||
| if (MNK < 265847707L) return MIN(ncpu, 40); | |||||
| if (MNK < 458314011L) return MIN(ncpu, 48); | |||||
| if (MNK < 729000000L) return MIN(ncpu, 56); | |||||
| return ncpu; | |||||
| } | |||||
| #endif | |||||
| /* Choose the thread count for a GEMM call from its problem size MNK. | |||||
| Single-precision real builds for NEOVERSEV1 (or DYNAMIC_ARCH builds whose | |||||
| runtime core name is "neoversev1") use the NEOVERSEV1 profile; everything | |||||
| else uses the threshold rule below. */ | |||||
| static inline int get_gemm_optimal_nthreads(double MNK) { | |||||
| const int ncpu = num_cpu_avail(3); | |||||
| /* Problem size at or below which a single thread is used. */ | |||||
| const double min_work = SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD; | |||||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||||
| return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); | |||||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||||
| if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||||
| return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); | |||||
| } | |||||
| #endif | |||||
| if (MNK <= min_work) { | |||||
| return 1; | |||||
| } | |||||
| /* Not enough work to keep every CPU above the threshold: scale the | |||||
| thread count down (the quotient is truncated to int on return). */ | |||||
| if (MNK / ncpu < min_work) { | |||||
| return MNK / min_work; | |||||
| } | |||||
| return ncpu; | |||||
| } | |||||
| #ifndef CBLAS | #ifndef CBLAS | ||||
| void NAME(char *TRANSA, char *TRANSB, | void NAME(char *TRANSA, char *TRANSB, | ||||
| @@ -310,7 +353,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| FLOAT *beta = (FLOAT*) vbeta; | FLOAT *beta = (FLOAT*) vbeta; | ||||
| FLOAT *a = (FLOAT*) va; | FLOAT *a = (FLOAT*) va; | ||||
| FLOAT *b = (FLOAT*) vb; | FLOAT *b = (FLOAT*) vb; | ||||
| FLOAT *c = (FLOAT*) vc; | |||||
| FLOAT *c = (FLOAT*) vc; | |||||
| #endif | #endif | ||||
| blas_arg_t args; | blas_arg_t args; | ||||
| @@ -352,7 +395,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) | #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) | ||||
| #ifdef DYNAMIC_ARCH | #ifdef DYNAMIC_ARCH | ||||
| if (support_avx512() ) | if (support_avx512() ) | ||||
| #endif | |||||
| #endif | |||||
| if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { | if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { | ||||
| SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | ||||
| return; | return; | ||||
| @@ -604,13 +647,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| #endif | #endif | ||||
| MNK = (double) args.m * (double) args.n * (double) args.k; | MNK = (double) args.m * (double) args.n * (double) args.k; | ||||
| if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||||
| args.nthreads = 1; | |||||
| else { | |||||
| args.nthreads = num_cpu_avail(3); | |||||
| if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) | |||||
| args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); | |||||
| } | |||||
| args.nthreads = get_gemm_optimal_nthreads(MNK); | |||||
| args.common = NULL; | args.common = NULL; | ||||
| @@ -63,6 +63,36 @@ static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT | |||||
| }; | }; | ||||
| #endif | #endif | ||||
| #ifdef DYNAMIC_ARCH | |||||
| extern char* gotoblas_corename(void); | |||||
| #endif | |||||
| #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) | |||||
| static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { | |||||
| /* Thread-count profile: one thread below the first MN bracket, a capped | |||||
| count (never above ncpu) for the middle brackets, all ncpu threads above. */ | |||||
| if (MN < 25600L) return 1; | |||||
| if (MN < 63001L) return MIN(ncpu, 4); | |||||
| if (MN < 459684L) return MIN(ncpu, 16); | |||||
| return ncpu; | |||||
| } | |||||
| #endif | |||||
| /* Choose the thread count for a GEMV call from its problem size MN. | |||||
| Single-precision real builds for NEOVERSEV1 (or DYNAMIC_ARCH builds whose | |||||
| runtime core name is "neoversev1") use the NEOVERSEV1 profile; everything | |||||
| else uses the fixed 115200 * GEMM_MULTITHREAD_THRESHOLD cut-off. */ | |||||
| static inline int get_gemv_optimal_nthreads(BLASLONG MN) { | |||||
| /* NOTE(review): ncpu is queried with level 3 while the generic path below | |||||
| queries level 2 again - confirm the two calls are intentionally different. */ | |||||
| int ncpu = num_cpu_avail(3); | |||||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||||
| return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); | |||||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) | |||||
| if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||||
| return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); | |||||
| } | |||||
| #endif | |||||
| return (MN < 115200L * GEMM_MULTITHREAD_THRESHOLD) ? 1 : num_cpu_avail(2); | |||||
| } | |||||
| #ifndef CBLAS | #ifndef CBLAS | ||||
| void NAME(char *TRANS, blasint *M, blasint *N, | void NAME(char *TRANS, blasint *M, blasint *N, | ||||
| @@ -225,11 +255,7 @@ void CNAME(enum CBLAS_ORDER order, | |||||
| STACK_ALLOC(buffer_size, FLOAT, buffer); | STACK_ALLOC(buffer_size, FLOAT, buffer); | ||||
| #ifdef SMP | #ifdef SMP | ||||
| if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD ) | |||||
| nthreads = 1; | |||||
| else | |||||
| nthreads = num_cpu_avail(2); | |||||
| nthreads = get_gemv_optimal_nthreads(1L * m * n); | |||||
| if (nthreads == 1) { | if (nthreads == 1) { | ||||
| #endif | #endif | ||||
| @@ -7,149 +7,21 @@ | |||||
| void NAME(blasint *N, FLOAT *dx, blasint *INCX, FLOAT *dy, blasint *INCY, FLOAT *dparam){ | void NAME(blasint *N, FLOAT *dx, blasint *INCX, FLOAT *dy, blasint *INCY, FLOAT *dparam){ | ||||
| blasint n = *N; | |||||
| blasint incx = *INCX; | |||||
| blasint incy = *INCY; | |||||
| blasint n = *N; | |||||
| blasint incx = *INCX; | |||||
| blasint incy = *INCY; | |||||
| PRINT_DEBUG_NAME | |||||
| #else | #else | ||||
| void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *dparam){ | void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *dparam){ | ||||
| #endif | |||||
| blasint i__1, i__2; | |||||
| PRINT_DEBUG_CNAME; | |||||
| blasint i__; | |||||
| FLOAT w, z__; | |||||
| blasint kx, ky; | |||||
| FLOAT dh11, dh12, dh22, dh21, dflag; | |||||
| blasint nsteps; | |||||
| #ifndef CBLAS | |||||
| PRINT_DEBUG_CNAME; | |||||
| #else | |||||
| PRINT_DEBUG_CNAME; | |||||
| #endif | #endif | ||||
| --dparam; | |||||
| --dy; | |||||
| --dx; | |||||
| dflag = dparam[1]; | |||||
| if (n <= 0 || dflag == - 2.0) goto L140; | |||||
| if (! (incx == incy && incx > 0)) goto L70; | |||||
| nsteps = n * incx; | |||||
| if (dflag < 0.) { | |||||
| goto L50; | |||||
| } else if (dflag == 0) { | |||||
| goto L10; | |||||
| } else { | |||||
| goto L30; | |||||
| } | |||||
| L10: | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| i__1 = nsteps; | |||||
| i__2 = incx; | |||||
| for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { | |||||
| w = dx[i__]; | |||||
| z__ = dy[i__]; | |||||
| dx[i__] = w + z__ * dh12; | |||||
| dy[i__] = w * dh21 + z__; | |||||
| /* L20: */ | |||||
| } | |||||
| goto L140; | |||||
| L30: | |||||
| dh11 = dparam[2]; | |||||
| dh22 = dparam[5]; | |||||
| i__2 = nsteps; | |||||
| i__1 = incx; | |||||
| for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { | |||||
| w = dx[i__]; | |||||
| z__ = dy[i__]; | |||||
| dx[i__] = w * dh11 + z__; | |||||
| dy[i__] = -w + dh22 * z__; | |||||
| /* L40: */ | |||||
| } | |||||
| goto L140; | |||||
| L50: | |||||
| dh11 = dparam[2]; | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| dh22 = dparam[5]; | |||||
| i__1 = nsteps; | |||||
| i__2 = incx; | |||||
| for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { | |||||
| w = dx[i__]; | |||||
| z__ = dy[i__]; | |||||
| dx[i__] = w * dh11 + z__ * dh12; | |||||
| dy[i__] = w * dh21 + z__ * dh22; | |||||
| /* L60: */ | |||||
| } | |||||
| goto L140; | |||||
| L70: | |||||
| kx = 1; | |||||
| ky = 1; | |||||
| if (incx < 0) { | |||||
| kx = (1 - n) * incx + 1; | |||||
| } | |||||
| if (incy < 0) { | |||||
| ky = (1 - n) * incy + 1; | |||||
| } | |||||
| ROTM_K(n, dx, incx, dy, incy, dparam); | |||||
| if (dflag < 0.) { | |||||
| goto L120; | |||||
| } else if (dflag == 0) { | |||||
| goto L80; | |||||
| } else { | |||||
| goto L100; | |||||
| } | |||||
| L80: | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| i__2 = n; | |||||
| for (i__ = 1; i__ <= i__2; ++i__) { | |||||
| w = dx[kx]; | |||||
| z__ = dy[ky]; | |||||
| dx[kx] = w + z__ * dh12; | |||||
| dy[ky] = w * dh21 + z__; | |||||
| kx += incx; | |||||
| ky += incy; | |||||
| /* L90: */ | |||||
| } | |||||
| goto L140; | |||||
| L100: | |||||
| dh11 = dparam[2]; | |||||
| dh22 = dparam[5]; | |||||
| i__2 = n; | |||||
| for (i__ = 1; i__ <= i__2; ++i__) { | |||||
| w = dx[kx]; | |||||
| z__ = dy[ky]; | |||||
| dx[kx] = w * dh11 + z__; | |||||
| dy[ky] = -w + dh22 * z__; | |||||
| kx += incx; | |||||
| ky += incy; | |||||
| /* L110: */ | |||||
| } | |||||
| goto L140; | |||||
| L120: | |||||
| dh11 = dparam[2]; | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| dh22 = dparam[5]; | |||||
| i__2 = n; | |||||
| for (i__ = 1; i__ <= i__2; ++i__) { | |||||
| w = dx[kx]; | |||||
| z__ = dy[ky]; | |||||
| dx[kx] = w * dh11 + z__ * dh12; | |||||
| dy[ky] = w * dh21 + z__ * dh22; | |||||
| kx += incx; | |||||
| ky += incy; | |||||
| /* L130: */ | |||||
| } | |||||
| L140: | |||||
| return; | return; | ||||
| } | } | ||||
| @@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type}) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type}) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTMKERNEL}" "" "rotm_k" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) | ||||
| @@ -125,6 +126,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${SROTMKERNEL}" "" "rotm_k" false "" "" false "SINGLE") | |||||
| endif () | endif () | ||||
| if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | ||||
| GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") | ||||
| @@ -148,6 +150,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") | |||||
| GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") | ||||
| @@ -1105,6 +1108,7 @@ endif () | |||||
| GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") | |||||
| GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") | GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") | ||||
| @@ -336,6 +336,18 @@ ifndef XROTKERNEL | |||||
| XROTKERNEL = zrot.S | XROTKERNEL = zrot.S | ||||
| endif | endif | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = rotm.S | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = rotm.S | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = rotm.S | |||||
| endif | |||||
| ### SCAL ### | ### SCAL ### | ||||
| ifndef SSCALKERNEL | ifndef SSCALKERNEL | ||||
| @@ -504,21 +516,21 @@ SBLASOBJS += \ | |||||
| sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | ||||
| sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ | sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ | ||||
| snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ | snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ | ||||
| saxpby_k$(TSUFFIX).$(SUFFIX) | |||||
| saxpby_k$(TSUFFIX).$(SUFFIX) srotm_k$(TSUFFIX).$(SUFFIX) | |||||
| DBLASOBJS += \ | DBLASOBJS += \ | ||||
| damax_k$(TSUFFIX).$(SUFFIX) damin_k$(TSUFFIX).$(SUFFIX) dmax_k$(TSUFFIX).$(SUFFIX) dmin_k$(TSUFFIX).$(SUFFIX) \ | damax_k$(TSUFFIX).$(SUFFIX) damin_k$(TSUFFIX).$(SUFFIX) dmax_k$(TSUFFIX).$(SUFFIX) dmin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ | idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ | dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ | ||||
| dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ | dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ | ||||
| daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) | |||||
| daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) drotm_k$(TSUFFIX).$(SUFFIX) | |||||
| QBLASOBJS += \ | QBLASOBJS += \ | ||||
| qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ | qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ | iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ | qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ | ||||
| qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ | qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ | ||||
| qsum_k$(TSUFFIX).$(SUFFIX) | |||||
| qsum_k$(TSUFFIX).$(SUFFIX) qrotm_k$(TSUFFIX).$(SUFFIX) | |||||
| CBLASOBJS += \ | CBLASOBJS += \ | ||||
| camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ | camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| @@ -842,7 +854,16 @@ $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERN | |||||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | ||||
| $(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) | $(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||||
| $(KDIR)srotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)srotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTMKERNEL) | |||||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ | |||||
| $(KDIR)drotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)drotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTMKERNEL) | |||||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||||
| $(KDIR)qrotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTMKERNEL) | |||||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||||
| $(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL) | $(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL) | ||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@ | ||||
| @@ -122,3 +122,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S | ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S | ||||
| ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S | ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S | ||||
| ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S | ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -43,4 +43,14 @@ ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | ZGEMM_BETA = ../generic/zgemm_beta.c | ||||
| endif | endif | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -45,4 +45,14 @@ ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | ZGEMM_BETA = ../generic/zgemm_beta.c | ||||
| endif | endif | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -171,3 +171,15 @@ QCABS_KERNEL = ../generic/cabs.c | |||||
| #Dump kernel | #Dump kernel | ||||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ||||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -213,7 +213,7 @@ CNAME(BLASLONG M, | |||||
| const BLASLONG n2 = N & -2; | const BLASLONG n2 = N & -2; | ||||
| const BLASLONG n8 = N & -8; | const BLASLONG n8 = N & -8; | ||||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||||
| const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||||
| FLOAT* packed_a = | FLOAT* packed_a = | ||||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | ||||
| @@ -219,7 +219,7 @@ CNAME(BLASLONG M, | |||||
| const BLASLONG n4 = N & -4; | const BLASLONG n4 = N & -4; | ||||
| const BLASLONG n2 = N & -2; | const BLASLONG n2 = N & -2; | ||||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||||
| const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||||
| FLOAT* packed_a = | FLOAT* packed_a = | ||||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | ||||
| @@ -222,7 +222,7 @@ CNAME(BLASLONG M, | |||||
| const BLASLONG n8 = N & -8; | const BLASLONG n8 = N & -8; | ||||
| const BLASLONG n4 = N & -4; | const BLASLONG n4 = N & -4; | ||||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||||
| const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||||
| FLOAT* packed_a = | FLOAT* packed_a = | ||||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | ||||
| @@ -223,7 +223,7 @@ CNAME(BLASLONG M, | |||||
| const BLASLONG n8 = N & -8; | const BLASLONG n8 = N & -8; | ||||
| const BLASLONG n4 = N & -4; | const BLASLONG n4 = N & -4; | ||||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||||
| const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; | |||||
| FLOAT* packed_a = | FLOAT* packed_a = | ||||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | ||||
| @@ -146,4 +146,14 @@ DGEMM_BETA = ../generic/gemm_beta.c | |||||
| CGEMM_BETA = ../generic/zgemm_beta.c | CGEMM_BETA = ../generic/zgemm_beta.c | ||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | ZGEMM_BETA = ../generic/zgemm_beta.c | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -146,4 +146,14 @@ DGEMM_BETA = ../generic/gemm_beta.c | |||||
| CGEMM_BETA = ../generic/zgemm_beta.c | CGEMM_BETA = ../generic/zgemm_beta.c | ||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | ZGEMM_BETA = ../generic/zgemm_beta.c | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -0,0 +1,159 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| int CNAME(BLASLONG n, FLOAT *dx, BLASLONG incx, FLOAT *dy, BLASLONG incy, FLOAT *dparam) | |||||
| { | |||||
| BLASLONG i__1, i__2; | |||||
| BLASLONG i__; | |||||
| FLOAT w, z__; | |||||
| BLASLONG kx, ky; | |||||
| FLOAT dh11, dh12, dh22, dh21, dflag; | |||||
| BLASLONG nsteps; | |||||
| --dparam; | |||||
| --dy; | |||||
| --dx; | |||||
| dflag = dparam[1]; | |||||
| if (n <= 0 || dflag == - 2.0) goto L140; | |||||
| if (! (incx == incy && incx > 0)) goto L70; | |||||
| nsteps = n * incx; | |||||
| if (dflag < 0.) { | |||||
| goto L50; | |||||
| } else if (dflag == 0) { | |||||
| goto L10; | |||||
| } else { | |||||
| goto L30; | |||||
| } | |||||
| L10: | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| i__1 = nsteps; | |||||
| i__2 = incx; | |||||
| for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { | |||||
| w = dx[i__]; | |||||
| z__ = dy[i__]; | |||||
| dx[i__] = w + z__ * dh12; | |||||
| dy[i__] = w * dh21 + z__; | |||||
| /* L20: */ | |||||
| } | |||||
| goto L140; | |||||
| L30: | |||||
| dh11 = dparam[2]; | |||||
| dh22 = dparam[5]; | |||||
| i__2 = nsteps; | |||||
| i__1 = incx; | |||||
| for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { | |||||
| w = dx[i__]; | |||||
| z__ = dy[i__]; | |||||
| dx[i__] = w * dh11 + z__; | |||||
| dy[i__] = -w + dh22 * z__; | |||||
| /* L40: */ | |||||
| } | |||||
| goto L140; | |||||
| L50: | |||||
| dh11 = dparam[2]; | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| dh22 = dparam[5]; | |||||
| i__1 = nsteps; | |||||
| i__2 = incx; | |||||
| for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { | |||||
| w = dx[i__]; | |||||
| z__ = dy[i__]; | |||||
| dx[i__] = w * dh11 + z__ * dh12; | |||||
| dy[i__] = w * dh21 + z__ * dh22; | |||||
| /* L60: */ | |||||
| } | |||||
| goto L140; | |||||
| L70: | |||||
| kx = 1; | |||||
| ky = 1; | |||||
| if (incx < 0) { | |||||
| kx = (1 - n) * incx + 1; | |||||
| } | |||||
| if (incy < 0) { | |||||
| ky = (1 - n) * incy + 1; | |||||
| } | |||||
| if (dflag < 0.) { | |||||
| goto L120; | |||||
| } else if (dflag == 0) { | |||||
| goto L80; | |||||
| } else { | |||||
| goto L100; | |||||
| } | |||||
| L80: | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| i__2 = n; | |||||
| for (i__ = 1; i__ <= i__2; ++i__) { | |||||
| w = dx[kx]; | |||||
| z__ = dy[ky]; | |||||
| dx[kx] = w + z__ * dh12; | |||||
| dy[ky] = w * dh21 + z__; | |||||
| kx += incx; | |||||
| ky += incy; | |||||
| /* L90: */ | |||||
| } | |||||
| goto L140; | |||||
| L100: | |||||
| dh11 = dparam[2]; | |||||
| dh22 = dparam[5]; | |||||
| i__2 = n; | |||||
| for (i__ = 1; i__ <= i__2; ++i__) { | |||||
| w = dx[kx]; | |||||
| z__ = dy[ky]; | |||||
| dx[kx] = w * dh11 + z__; | |||||
| dy[ky] = -w + dh22 * z__; | |||||
| kx += incx; | |||||
| ky += incy; | |||||
| /* L110: */ | |||||
| } | |||||
| goto L140; | |||||
| L120: | |||||
| dh11 = dparam[2]; | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| dh22 = dparam[5]; | |||||
| i__2 = n; | |||||
| for (i__ = 1; i__ <= i__2; ++i__) { | |||||
| w = dx[kx]; | |||||
| z__ = dy[ky]; | |||||
| dx[kx] = w * dh11 + z__ * dh12; | |||||
| dy[ky] = w * dh21 + z__ * dh22; | |||||
| kx += incx; | |||||
| ky += incy; | |||||
| /* L130: */ | |||||
| } | |||||
| L140: | |||||
| return(0); | |||||
| } | |||||
| @@ -142,3 +142,15 @@ ZTRSMKERNEL_RT = ztrsm_kernel_RT.S | |||||
| CGEMM3MKERNEL = zgemm3m_kernel.S | CGEMM3MKERNEL = zgemm3m_kernel.S | ||||
| ZGEMM3MKERNEL = zgemm3m_kernel.S | ZGEMM3MKERNEL = zgemm3m_kernel.S | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -236,3 +236,15 @@ ZGEMM3MKERNEL = zgemm3m_kernel.S | |||||
| endif | endif | ||||
| DSDOTKERNEL = dot.S | DSDOTKERNEL = dot.S | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -169,3 +169,15 @@ QCABS_KERNEL = ../generic/cabs.c | |||||
| #Dump kernel | #Dump kernel | ||||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ||||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -43,4 +43,14 @@ ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | ZGEMM_BETA = ../generic/zgemm_beta.c | ||||
| endif | endif | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -158,3 +158,15 @@ ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ||||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -199,3 +199,15 @@ endif | |||||
| ifndef IQMAXKERNEL | ifndef IQMAXKERNEL | ||||
| IQMAXKERNEL = imax.S | IQMAXKERNEL = imax.S | ||||
| endif | endif | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -158,3 +158,15 @@ ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ||||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -73,3 +73,15 @@ endif | |||||
| ifndef IQMAXKERNEL | ifndef IQMAXKERNEL | ||||
| IQMAXKERNEL = imax.S | IQMAXKERNEL = imax.S | ||||
| endif | endif | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -27,4 +27,14 @@ ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | ZGEMM_BETA = ../generic/zgemm_beta.c | ||||
| endif | endif | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -71,6 +71,10 @@ DROTKERNEL = rot_vector.c | |||||
| CROTKERNEL = zrot_vector.c | CROTKERNEL = zrot_vector.c | ||||
| ZROTKERNEL = zrot_vector.c | ZROTKERNEL = zrot_vector.c | ||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| SSCALKERNEL = scal_vector.c | SSCALKERNEL = scal_vector.c | ||||
| DSCALKERNEL = scal_vector.c | DSCALKERNEL = scal_vector.c | ||||
| CSCALKERNEL = zscal_vector.c | CSCALKERNEL = zscal_vector.c | ||||
| @@ -71,6 +71,10 @@ DROTKERNEL = ../riscv64/rot.c | |||||
| CROTKERNEL = ../riscv64/zrot.c | CROTKERNEL = ../riscv64/zrot.c | ||||
| ZROTKERNEL = ../riscv64/zrot.c | ZROTKERNEL = ../riscv64/zrot.c | ||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| SSCALKERNEL = ../riscv64/scal.c | SSCALKERNEL = ../riscv64/scal.c | ||||
| DSCALKERNEL = ../riscv64/scal.c | DSCALKERNEL = ../riscv64/scal.c | ||||
| CSCALKERNEL = ../riscv64/zscal.c | CSCALKERNEL = ../riscv64/zscal.c | ||||
| @@ -71,6 +71,10 @@ DROTKERNEL = rot_rvv.c | |||||
| CROTKERNEL = zrot_rvv.c | CROTKERNEL = zrot_rvv.c | ||||
| ZROTKERNEL = zrot_rvv.c | ZROTKERNEL = zrot_rvv.c | ||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| SSCALKERNEL = scal_rvv.c | SSCALKERNEL = scal_rvv.c | ||||
| DSCALKERNEL = scal_rvv.c | DSCALKERNEL = scal_rvv.c | ||||
| CSCALKERNEL = zscal_rvv.c | CSCALKERNEL = zscal_rvv.c | ||||
| @@ -66,6 +66,10 @@ DROTKERNEL = rot_vector.c | |||||
| CROTKERNEL = zrot_vector.c | CROTKERNEL = zrot_vector.c | ||||
| ZROTKERNEL = zrot_vector.c | ZROTKERNEL = zrot_vector.c | ||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| SSCALKERNEL = scal_vector.c | SSCALKERNEL = scal_vector.c | ||||
| DSCALKERNEL = scal_vector.c | DSCALKERNEL = scal_vector.c | ||||
| CSCALKERNEL = zscal_vector.c | CSCALKERNEL = zscal_vector.c | ||||
| @@ -98,6 +98,10 @@ DROTKERNEL = rot_rvv.c | |||||
| CROTKERNEL = zrot_rvv.c | CROTKERNEL = zrot_rvv.c | ||||
| ZROTKERNEL = zrot_rvv.c | ZROTKERNEL = zrot_rvv.c | ||||
| SROTMKERNEL = rotm_rvv.c | |||||
| DROTMKERNEL = rotm_rvv.c | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| SSCALKERNEL = scal_rvv.c | SSCALKERNEL = scal_rvv.c | ||||
| DSCALKERNEL = scal_rvv.c | DSCALKERNEL = scal_rvv.c | ||||
| CSCALKERNEL = zscal_rvv.c | CSCALKERNEL = zscal_rvv.c | ||||
| @@ -0,0 +1,260 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #if !defined(DOUBLE) | |||||
| #define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||||
| #define FLOAT_V_T vfloat32m8_t | |||||
| #define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||||
| #define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 | |||||
| #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 | |||||
| #else | |||||
| #define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||||
| #define FLOAT_V_T vfloat64m8_t | |||||
| #define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||||
| #define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||||
| #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||||
| #define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 | |||||
| #define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 | |||||
| #endif | |||||
| int CNAME(BLASLONG n, FLOAT *dx, BLASLONG incx, FLOAT *dy, BLASLONG incy, FLOAT *dparam) | |||||
| { | |||||
| BLASLONG i__1, i__2; | |||||
| BLASLONG kx, ky; | |||||
| FLOAT dh11, dh12, dh22, dh21, dflag; | |||||
| BLASLONG nsteps; | |||||
| --dparam; | |||||
| --dy; | |||||
| --dx; | |||||
| FLOAT_V_T v_w, v_z__, v_dx, v_dy; | |||||
| BLASLONG stride, stride_x, stride_y, offset; | |||||
| dflag = dparam[1]; | |||||
| if (n <= 0 || dflag == - 2.0) goto L140; | |||||
| if (!(incx == incy && incx > 0)) goto L70; | |||||
| nsteps = n * incx; | |||||
| if (dflag < 0.) { | |||||
| goto L50; | |||||
| } else if (dflag == 0) { | |||||
| goto L10; | |||||
| } else { | |||||
| goto L30; | |||||
| } | |||||
| L10: | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| i__1 = nsteps; | |||||
| i__2 = incx; | |||||
| if(i__2 < 0){ | |||||
| offset = i__1 - 2; | |||||
| dx += offset; | |||||
| dy += offset; | |||||
| i__1 = -i__1; | |||||
| i__2 = -i__2; | |||||
| } | |||||
| stride = i__2 * sizeof(FLOAT); | |||||
| n = i__1 / i__2; | |||||
| for (size_t vl; n > 0; n -= vl, dx += vl*i__2, dy += vl*i__2) { | |||||
| vl = VSETVL(n); | |||||
| v_w = VLSEV_FLOAT(&dx[1], stride, vl); | |||||
| v_z__ = VLSEV_FLOAT(&dy[1], stride, vl); | |||||
| v_dx = VFMACCVF_FLOAT(v_w, dh12, v_z__, vl); | |||||
| v_dy = VFMACCVF_FLOAT(v_z__, dh21, v_w, vl); | |||||
| VSSEV_FLOAT(&dx[1], stride, v_dx, vl); | |||||
| VSSEV_FLOAT(&dy[1], stride, v_dy, vl); | |||||
| } | |||||
| goto L140; | |||||
| L30: | |||||
| dh11 = dparam[2]; | |||||
| dh22 = dparam[5]; | |||||
| i__2 = nsteps; | |||||
| i__1 = incx; | |||||
| if(i__1 < 0){ | |||||
| offset = i__2 - 2; | |||||
| dx += offset; | |||||
| dy += offset; | |||||
| i__1 = -i__1; | |||||
| i__2 = -i__2; | |||||
| } | |||||
| stride = i__1 * sizeof(FLOAT); | |||||
| n = i__2 / i__1; | |||||
| for (size_t vl; n > 0; n -= vl, dx += vl*i__1, dy += vl*i__1) { | |||||
| vl = VSETVL(n); | |||||
| v_w = VLSEV_FLOAT(&dx[1], stride, vl); | |||||
| v_z__ = VLSEV_FLOAT(&dy[1], stride, vl); | |||||
| v_dx = VFMACCVF_FLOAT(v_z__, dh11, v_w, vl); | |||||
| v_dy = VFMSACVF_FLOAT(v_w, dh22, v_z__, vl); | |||||
| VSSEV_FLOAT(&dx[1], stride, v_dx, vl); | |||||
| VSSEV_FLOAT(&dy[1], stride, v_dy, vl); | |||||
| } | |||||
| goto L140; | |||||
| L50: | |||||
| dh11 = dparam[2]; | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| dh22 = dparam[5]; | |||||
| i__1 = nsteps; | |||||
| i__2 = incx; | |||||
| if(i__2 < 0){ | |||||
| offset = i__1 - 2; | |||||
| dx += offset; | |||||
| dy += offset; | |||||
| i__1 = -i__1; | |||||
| i__2 = -i__2; | |||||
| } | |||||
| stride = i__2 * sizeof(FLOAT); | |||||
| n = i__1 / i__2; | |||||
| for (size_t vl; n > 0; n -= vl, dx += vl*i__2, dy += vl*i__2) { | |||||
| vl = VSETVL(n); | |||||
| v_w = VLSEV_FLOAT(&dx[1], stride, vl); | |||||
| v_z__ = VLSEV_FLOAT(&dy[1], stride, vl); | |||||
| v_dx = VFMULVF_FLOAT(v_w, dh11, vl); | |||||
| v_dx = VFMACCVF_FLOAT(v_dx, dh12, v_z__, vl); | |||||
| VSSEV_FLOAT(&dx[1], stride, v_dx, vl); | |||||
| v_dy = VFMULVF_FLOAT(v_w, dh21, vl); | |||||
| v_dy = VFMACCVF_FLOAT(v_dy, dh22, v_z__, vl); | |||||
| VSSEV_FLOAT(&dy[1], stride, v_dy, vl); | |||||
| } | |||||
| goto L140; | |||||
| L70: | |||||
| kx = 1; | |||||
| ky = 1; | |||||
| if (incx < 0) { | |||||
| kx = (1 - n) * incx + 1; | |||||
| } | |||||
| if (incy < 0) { | |||||
| ky = (1 - n) * incy + 1; | |||||
| } | |||||
| if (dflag < 0.) { | |||||
| goto L120; | |||||
| } else if (dflag == 0) { | |||||
| goto L80; | |||||
| } else { | |||||
| goto L100; | |||||
| } | |||||
| L80: | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| if(incx < 0){ | |||||
| incx = -incx; | |||||
| dx -= n*incx; | |||||
| } | |||||
| if(incy < 0){ | |||||
| incy = -incy; | |||||
| dy -= n*incy; | |||||
| } | |||||
| stride_x = incx * sizeof(FLOAT); | |||||
| stride_y = incy * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, dx += vl*incx, dy += vl*incy) { | |||||
| vl = VSETVL(n); | |||||
| v_w = VLSEV_FLOAT(&dx[kx], stride_x, vl); | |||||
| v_z__ = VLSEV_FLOAT(&dy[ky], stride_y, vl); | |||||
| v_dx = VFMACCVF_FLOAT(v_w, dh12, v_z__, vl); | |||||
| v_dy = VFMACCVF_FLOAT(v_z__, dh21, v_w, vl); | |||||
| VSSEV_FLOAT(&dx[kx], stride_x, v_dx, vl); | |||||
| VSSEV_FLOAT(&dy[ky], stride_y, v_dy, vl); | |||||
| } | |||||
| goto L140; | |||||
| L100: | |||||
| dh11 = dparam[2]; | |||||
| dh22 = dparam[5]; | |||||
| if(incx < 0){ | |||||
| incx = -incx; | |||||
| dx -= n*incx; | |||||
| } | |||||
| if(incy < 0){ | |||||
| incy = -incy; | |||||
| dy -= n*incy; | |||||
| } | |||||
| stride_x = incx * sizeof(FLOAT); | |||||
| stride_y = incy * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, dx += vl*incx, dy += vl*incy) { | |||||
| vl = VSETVL(n); | |||||
| v_w = VLSEV_FLOAT(&dx[kx], stride_x, vl); | |||||
| v_z__ = VLSEV_FLOAT(&dy[ky], stride_y, vl); | |||||
| v_dx = VFMACCVF_FLOAT(v_z__, dh11, v_w, vl); | |||||
| v_dy = VFMSACVF_FLOAT(v_w, dh22, v_z__, vl); | |||||
| VSSEV_FLOAT(&dx[kx], stride_x, v_dx, vl); | |||||
| VSSEV_FLOAT(&dy[ky], stride_y, v_dy, vl); | |||||
| } | |||||
| goto L140; | |||||
| L120: | |||||
| dh11 = dparam[2]; | |||||
| dh12 = dparam[4]; | |||||
| dh21 = dparam[3]; | |||||
| dh22 = dparam[5]; | |||||
| if(incx < 0){ | |||||
| incx = -incx; | |||||
| dx -= n*incx; | |||||
| } | |||||
| if(incy < 0){ | |||||
| incy = -incy; | |||||
| dy -= n*incy; | |||||
| } | |||||
| stride_x = incx * sizeof(FLOAT); | |||||
| stride_y = incy * sizeof(FLOAT); | |||||
| for (size_t vl; n > 0; n -= vl, dx += vl*incx, dy += vl*incy) { | |||||
| vl = VSETVL(n); | |||||
| v_w = VLSEV_FLOAT(&dx[kx], stride_x, vl); | |||||
| v_z__ = VLSEV_FLOAT(&dy[ky], stride_y, vl); | |||||
| v_dx = VFMULVF_FLOAT(v_w, dh11, vl); | |||||
| v_dx = VFMACCVF_FLOAT(v_dx, dh12, v_z__, vl); | |||||
| VSSEV_FLOAT(&dx[kx], stride_x, v_dx, vl); | |||||
| v_dy = VFMULVF_FLOAT(v_w, dh21, vl); | |||||
| v_dy = VFMACCVF_FLOAT(v_dy, dh22, v_z__, vl); | |||||
| VSSEV_FLOAT(&dy[ky], stride_y, v_dy, vl); | |||||
| } | |||||
| L140: | |||||
| return(0); | |||||
| } | |||||
| @@ -72,9 +72,9 @@ gotoblas_t TABLE_NAME = { | |||||
| samax_kTS, samin_kTS, smax_kTS, smin_kTS, | samax_kTS, samin_kTS, smax_kTS, smin_kTS, | ||||
| isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, | isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, | ||||
| snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS, | |||||
| snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS, | |||||
| dsdot_kTS, | dsdot_kTS, | ||||
| srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, | |||||
| srot_kTS, srotm_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, | |||||
| sbgemv_nTS, sbgemv_tTS, sger_kTS, | sbgemv_nTS, sbgemv_tTS, sger_kTS, | ||||
| ssymv_LTS, ssymv_UTS, | ssymv_LTS, ssymv_UTS, | ||||
| @@ -158,7 +158,7 @@ gotoblas_t TABLE_NAME = { | |||||
| #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) | #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) | ||||
| scopy_kTS, sdot_kTS, | scopy_kTS, sdot_kTS, | ||||
| // dsdot_kTS, | // dsdot_kTS, | ||||
| srot_kTS, saxpy_kTS, | |||||
| srot_kTS, srotm_kTS, saxpy_kTS, | |||||
| #endif | #endif | ||||
| #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) | #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) | ||||
| sscal_kTS, | sscal_kTS, | ||||
| @@ -260,6 +260,7 @@ gotoblas_t TABLE_NAME = { | |||||
| #endif | #endif | ||||
| #if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) | #if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) | ||||
| drot_kTS, | drot_kTS, | ||||
| drotm_kTS, | |||||
| daxpy_kTS, | daxpy_kTS, | ||||
| dscal_kTS, | dscal_kTS, | ||||
| dswap_kTS, | dswap_kTS, | ||||
| @@ -331,10 +332,9 @@ gotoblas_t TABLE_NAME = { | |||||
| qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS, | qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS, | ||||
| iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS, | iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS, | ||||
| qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS, | qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS, | ||||
| qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, | |||||
| qrot_kTS, qrotm_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, | |||||
| qgemv_nTS, qgemv_tTS, qger_kTS, | qgemv_nTS, qgemv_tTS, qger_kTS, | ||||
| qsymv_LTS, qsymv_UTS, | qsymv_LTS, qsymv_UTS, | ||||
| qgemm_kernelTS, qgemm_betaTS, | qgemm_kernelTS, qgemm_betaTS, | ||||
| #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N | #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N | ||||
| qgemm_incopyTS, qgemm_itcopyTS, | qgemm_incopyTS, qgemm_itcopyTS, | ||||
| @@ -75,3 +75,14 @@ DGEMM_BETA = ../generic/gemm_beta.c | |||||
| CGEMM_BETA = ../generic/zgemm_beta.c | CGEMM_BETA = ../generic/zgemm_beta.c | ||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | ZGEMM_BETA = ../generic/zgemm_beta.c | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -189,3 +189,14 @@ ZGEMM_BETA = ../generic/zgemm_beta.c | |||||
| QGEMM_BETA = ../generic/gemm_beta.c | QGEMM_BETA = ../generic/gemm_beta.c | ||||
| XGEMM_BETA = ../generic/zgemm_beta.c | XGEMM_BETA = ../generic/zgemm_beta.c | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -162,3 +162,15 @@ ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ||||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | ||||
| ifndef SROTMKERNEL | |||||
| SROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef DROTMKERNEL | |||||
| DROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| ifndef QROTMKERNEL | |||||
| QROTMKERNEL = ../generic/rotm.c | |||||
| endif | |||||
| @@ -290,6 +290,18 @@ ifndef QROTKERNEL | |||||
| QROTKERNEL = rot.S | QROTKERNEL = rot.S | ||||
| endif | endif | ||||
# ROTM kernels: point each variable at the generic C source unless it
# already carries a value (ifndef also treats an empty value as unset).
ifndef SROTMKERNEL
  SROTMKERNEL = ../generic/rotm.c
endif

ifndef DROTMKERNEL
  DROTMKERNEL = ../generic/rotm.c
endif

ifndef QROTMKERNEL
  QROTMKERNEL = ../generic/rotm.c
endif

# CROT kernel: default to zrot_sse.S when nothing has been assigned yet.
ifndef CROTKERNEL
  CROTKERNEL = zrot_sse.S
endif
| @@ -168,3 +168,15 @@ QCABS_KERNEL = ../generic/cabs.c | |||||
# Dump kernel
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c

# ROTM kernels: point each variable at the generic C source unless it
# already carries a value (ifndef also treats an empty value as unset).
ifndef SROTMKERNEL
  SROTMKERNEL = ../generic/rotm.c
endif

ifndef DROTMKERNEL
  DROTMKERNEL = ../generic/rotm.c
endif

ifndef QROTMKERNEL
  QROTMKERNEL = ../generic/rotm.c
endif
| @@ -27,4 +27,14 @@ ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | ZGEMM_BETA = ../generic/zgemm_beta.c | ||||
| endif | endif | ||||
# ROTM kernels: point each variable at the generic C source unless it
# already carries a value (ifndef also treats an empty value as unset).
ifndef SROTMKERNEL
  SROTMKERNEL = ../generic/rotm.c
endif

ifndef DROTMKERNEL
  DROTMKERNEL = ../generic/rotm.c
endif

ifndef QROTMKERNEL
  QROTMKERNEL = ../generic/rotm.c
endif
| @@ -135,5 +135,14 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

# ROTM kernels: point each variable at the generic C source unless it
# already carries a value (ifndef also treats an empty value as unset).
ifndef SROTMKERNEL
  SROTMKERNEL = ../generic/rotm.c
endif

ifndef DROTMKERNEL
  DROTMKERNEL = ../generic/rotm.c
endif

ifndef QROTMKERNEL
  QROTMKERNEL = ../generic/rotm.c
endif
| @@ -70,6 +70,24 @@ CTEST(rot,drot_inc_1) | |||||
| ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); | ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); | ||||
| } | } | ||||
| } | } | ||||
| CTEST(rot,drotm_inc_1) | |||||
| { | |||||
| blasint i = 0; | |||||
| blasint N = 12, incX = 1, incY = 1; | |||||
| double param[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; | |||||
| double x_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; | |||||
| double y_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; | |||||
| double x_referece[] = {3.0, 6.0, 9.0, 12.0, 15.0, 18.0, 21.0, 24.0, 27.0, 30.0, 33.0, 36.0}; | |||||
| double y_referece[] = {4.0, 8.0, 12.0, 16.0, 20.0, 24.0, 28.0, 32.0, 36.0, 40.0, 44.0, 48.0}; | |||||
| //OpenBLAS | |||||
| BLASFUNC(drotm)(&N, x_actual, &incX, y_actual, &incY, param); | |||||
| for(i = 0; i < N; i++){ | |||||
| ASSERT_DBL_NEAR_TOL(x_referece[i], x_actual[i], DOUBLE_EPS); | |||||
| ASSERT_DBL_NEAR_TOL(y_referece[i], y_actual[i], DOUBLE_EPS); | |||||
| } | |||||
| } | |||||
| #endif | #endif | ||||
| #ifdef BUILD_COMPLEX16 | #ifdef BUILD_COMPLEX16 | ||||
| @@ -130,6 +148,24 @@ CTEST(rot,srot_inc_1) | |||||
| ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); | ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); | ||||
| } | } | ||||
| } | } | ||||
| CTEST(rot,srotm_inc_1) | |||||
| { | |||||
| blasint i = 0; | |||||
| blasint N = 12, incX = 1, incY = 1; | |||||
| float param[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; | |||||
| float x_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; | |||||
| float y_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; | |||||
| float x_referece[] = {3.0, 6.0, 9.0, 12.0, 15.0, 18.0, 21.0, 24.0, 27.0, 30.0, 33.0, 36.0}; | |||||
| float y_referece[] = {4.0, 8.0, 12.0, 16.0, 20.0, 24.0, 28.0, 32.0, 36.0, 40.0, 44.0, 48.0}; | |||||
| //OpenBLAS | |||||
| BLASFUNC(srotm)(&N, x_actual, &incX, y_actual, &incY, param); | |||||
| for(i = 0; i < N; i++){ | |||||
| ASSERT_DBL_NEAR_TOL(x_referece[i], x_actual[i], SINGLE_EPS); | |||||
| ASSERT_DBL_NEAR_TOL(y_referece[i], y_actual[i], SINGLE_EPS); | |||||
| } | |||||
| } | |||||
| #endif | #endif | ||||
| #ifdef BUILD_COMPLEX | #ifdef BUILD_COMPLEX | ||||