From 8a316e68a52693f488774a7dfbb4041d3add329a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Jan 2025 17:38:20 +0100 Subject: [PATCH 001/205] Update version to 0.3.29.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e49c4e29..8e99bd208 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 29) +set(OpenBLAS_PATCH_VERSION 29.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From 4e817f804c61a3c7c94c3ce52db433ec20e83ab3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Jan 2025 17:39:00 +0100 Subject: [PATCH 002/205] Update version to 0.3.29.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index c3edd1cff..1472ed938 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.29 +VERSION = 0.3.29.dev # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library From af10c132b845dbf290e8df2f41fe396e017dbd9c Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Mon, 13 Jan 2025 21:43:18 +0800 Subject: [PATCH 003/205] LoongArch64: Fix dsymv and ssymv LASX version "fmov.d $f2, $f4" leaves all the bits higher than the 63-th bit unpredictable but it's obvious that the following code uses the value of those high bits. We actually want to replicate the lower 64 bits here, so we should use xvreplve0.d instead. LA464 (Loongson 3[A-Z]-5000) happens to replicate them for us due to some uarch internal details so the issue was not detected, but for LA664 (Loongson 3[A-Z]-6000) and future uarch we need to do things correctly or we end up getting a lot of test failures. Closes: https://bbs.aosc.io/t/topic/302 Signed-off-by: Xi Ruoyao --- kernel/loongarch64/dsymv_L_lasx.S | 2 +- kernel/loongarch64/dsymv_U_lasx.S | 2 +- kernel/loongarch64/ssymv_L_lasx.S | 2 +- kernel/loongarch64/ssymv_U_lasx.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/loongarch64/dsymv_L_lasx.S b/kernel/loongarch64/dsymv_L_lasx.S index a36cff9a9..508232228 100644 --- a/kernel/loongarch64/dsymv_L_lasx.S +++ b/kernel/loongarch64/dsymv_L_lasx.S @@ -288,7 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //Acc U2 GACC xvf, d, U4, U2 - fmov.d $f2, $f4 + xvreplve0.d U2, U4 .L03: /* &4 */ sub.d T0, M, J addi.d T0, T0, -1 diff --git a/kernel/loongarch64/dsymv_U_lasx.S b/kernel/loongarch64/dsymv_U_lasx.S index 892c5ed2f..21bf3dffc 100644 --- a/kernel/loongarch64/dsymv_U_lasx.S +++ b/kernel/loongarch64/dsymv_U_lasx.S @@ -272,7 +272,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //Acc U2 GACC xvf, d, U4, U2 - fmov.d $f2, $f4 + xvreplve0.d U2, U4 .L03: /* &4 */ andi T0, J, 4 diff --git a/kernel/loongarch64/ssymv_L_lasx.S b/kernel/loongarch64/ssymv_L_lasx.S index 81796883d..21ffcec69 100644 --- a/kernel/loongarch64/ssymv_L_lasx.S +++ b/kernel/loongarch64/ssymv_L_lasx.S @@ -279,7 +279,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//Acc U2 GACC xvf, s, U4, U2 - fmov.d $f2, $f4 + xvreplve0.d U2, U4 .L03: /* &4 */ sub.d T0, M, J diff --git a/kernel/loongarch64/ssymv_U_lasx.S b/kernel/loongarch64/ssymv_U_lasx.S index ff68723e1..662f311d9 100644 --- a/kernel/loongarch64/ssymv_U_lasx.S +++ b/kernel/loongarch64/ssymv_U_lasx.S @@ -263,7 +263,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //Acc U2 GACC xvf, s, U4, U2 - fmov.d $f2, $f4 + xvreplve0.d U2, U4 .L03: /* &4 */ andi T0, J, 4 From c8cd8da496406b83d9c171fe572719bbfcaf2105 Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Mon, 13 Jan 2025 15:43:08 +0000 Subject: [PATCH 004/205] Add thread throttling profile for SGEMM on NEOVERSEV1 --- CONTRIBUTORS.md | 3 +++ interface/gemm.c | 57 +++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 10 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 508dbcd0e..d97eb3bcc 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -232,3 +232,6 @@ In chronological order: * Aniket P. Garade Sushil Pratap Singh Juliya James * [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE + +* Annop Wongwathanarat + * [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1 \ No newline at end of file diff --git a/interface/gemm.c b/interface/gemm.c index c9f810faa..8a806cfb4 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -1,5 +1,5 @@ /*********************************************************************/ -/* Copyright 2024 The OpenBLAS Project */ +/* Copyright 2024, 2025 The OpenBLAS Project */ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ @@ -177,6 +177,49 @@ static int init_amxtile_permission() { } #endif +#ifdef DYNAMIC_ARCH +extern char* gotoblas_corename(void); +#endif + +#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) +static inline int get_gemm_optimal_nthreads_neoversev1(double MNK, int ncpu) { + return + MNK < 262144L ? 1 + : MNK < 1124864L ? MIN(ncpu, 6) + : MNK < 7880599L ? MIN(ncpu, 12) + : MNK < 17173512L ? MIN(ncpu, 16) + : MNK < 33386248L ? MIN(ncpu, 20) + : MNK < 57066625L ? MIN(ncpu, 24) + : MNK < 91733851L ? MIN(ncpu, 32) + : MNK < 265847707L ? MIN(ncpu, 40) + : MNK < 458314011L ? MIN(ncpu, 48) + : MNK < 729000000L ? 
MIN(ncpu, 56) + : ncpu; +} +#endif + +static inline int get_gemm_optimal_nthreads(double MNK) { + int ncpu = num_cpu_avail(3); +#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) + return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); +#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) + if (strcmp(gotoblas_corename(), "neoversev1") == 0) { + return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); + } +#endif + if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) { + return 1; + } + else { + if (MNK/ncpu < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) { + return MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); + } + else { + return ncpu; + } + } +} + #ifndef CBLAS void NAME(char *TRANSA, char *TRANSB, @@ -310,7 +353,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FLOAT *beta = (FLOAT*) vbeta; FLOAT *a = (FLOAT*) va; FLOAT *b = (FLOAT*) vb; - FLOAT *c = (FLOAT*) vc; + FLOAT *c = (FLOAT*) vc; #endif blas_arg_t args; @@ -352,7 +395,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) #ifdef DYNAMIC_ARCH if (support_avx512() ) -#endif +#endif if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); return; @@ -604,13 +647,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif MNK = (double) args.m * (double) args.n * (double) args.k; - if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) - args.nthreads = 1; - else { - args.nthreads = num_cpu_avail(3); - if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) - args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); - } + args.nthreads = get_gemm_optimal_nthreads(MNK); args.common = NULL; From 0c0112dfef960db77957e78619561459d42df24f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 13 Jan 2025 17:29:38 +0100 Subject: [PATCH 005/205] update deprecated macos-12 jobs to macos-13 image --- azure-pipelines.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 8c5b1e5bb..0bdf4e316 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -141,7 +141,7 @@ jobs: - job: OSX_OpenMP pool: - vmImage: 'macOS-12' + vmImage: 'macOS-13' steps: - script: | brew update @@ -151,7 +151,7 @@ jobs: - job: OSX_GCC_Nothreads pool: - vmImage: 'macOS-12' + vmImage: 'macOS-13' steps: - script: | brew update @@ -195,7 +195,7 @@ jobs: - job: OSX_dynarch_cmake pool: - vmImage: 'macOS-12' + vmImage: 'macOS-13' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -242,7 +242,7 @@ jobs: - job: OSX_NDK_ARMV7 pool: - vmImage: 'macOS-12' + vmImage: 'macOS-13' steps: - script: | brew update @@ -252,7 +252,7 @@ jobs: - job: OSX_IOS_ARMV8 pool: - vmImage: 'macOS-12' + vmImage: 'macOS-13' variables: CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch arm64 -miphoneos-version-min=10.0 @@ -262,7 +262,7 @@ jobs: - job: OSX_IOS_ARMV7 pool: - 
vmImage: 'macOS-12' + vmImage: 'macOS-13' variables: CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch armv7 -miphoneos-version-min=5.1 @@ -272,7 +272,7 @@ jobs: - job: OSX_xbuild_DYNAMIC_ARM64 pool: - vmImage: 'macOS-12' + vmImage: 'macOS-13' variables: CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX13.1.sdk -arch arm64 From a9070ba3f9f1a3ae2892a5b2b0bb56022852d748 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 14 Jan 2025 09:06:29 +0000 Subject: [PATCH 006/205] LoongArch64: Update ssymv LSX version --- kernel/loongarch64/ssymv_L_lsx.S | 212 +++++++++++++++++-------------- kernel/loongarch64/ssymv_U_lsx.S | 189 +++++++++++++++------------ 2 files changed, 223 insertions(+), 178 deletions(-) diff --git a/kernel/loongarch64/ssymv_L_lsx.S b/kernel/loongarch64/ssymv_L_lsx.S index 949e9e902..a98cad38b 100644 --- a/kernel/loongarch64/ssymv_L_lsx.S +++ b/kernel/loongarch64/ssymv_L_lsx.S @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ASSEMBLER #include "common.h" +#include "loongarch64_asm.S" /* Param */ #define M $r4 @@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define T2 $r28 #define T3 $r29 #define T4 $r30 +#define T5 $r17 +#define T6 $r16 /* LSX vectors */ #define U0 $vr31 @@ -88,77 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define a9 $f9 - PROLOGUE - - LDARG BUFFER, $sp, 0 - - addi.d $sp, $sp, -88 - - SDARG $r23, $sp, 0 - SDARG $r24, $sp, 8 - SDARG $r25, $sp, 16 - SDARG $r26, $sp, 32 - SDARG $r27, $sp, 40 - SDARG $r28, $sp, 48 - SDARG $r29, $sp, 56 - SDARG $r30, $sp, 64 - SDARG $r31, $sp, 72 - ST ALPHA, $sp, 80 - - vldrepl.w VALPHA, $sp, 80 - - slli.d LDA, LDA, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - - bge $r0, M, .L999 - bge $r0, N, .L999 - - move J, $r0 - move JY, $r0 - move JX, $r0 - move AO1, A - - beq J, N, .L999 - -.L01: - MTC a2, $r0 //temp2 - fldx.s a6, X, JX - fmul.s a3, ALPHA, a6 //temp1 - vpermi.w U3, U3, 0x00 - vpermi.w U2, U2, 0x00 - - mul.w T0, J, LDA - slli.d T1, J, BASE_SHIFT - add.w T0, T0, T1 - fldx.s a6, AO1, T0 - fldx.s a4, Y, JY - fmadd.s a4, a3, a6, a4 - fstx.s a4, Y, JY - - move IY, JY - move IX, JX - addi.d II, J, 1 - move I, II - slli.d II, II, BASE_SHIFT - - sub.d T0, M, J - addi.d T0, T0, -1 - srai.d T0, T0, 3 - add.d T0, T0, J - addi.d T0, T0, 1 - beq I, T0, .L03 - bge I, T0, .L03 - - mul.w T1, J, LDA - add.d T1, T1, II - -.L02: /* /8 */ - vldx U1, AO1, T1 - addi.d T1, T1, 16 - vldx U14, AO1, T1 - addi.d T1, T1, 16 - +.macro LOAD_Y_8 + beqz T5, .L01_Y_0 add.d T2, IY, INCY fldx.s $f4, Y, T2 add.d T2, T2, INCY @@ -183,10 +117,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vextrins.w U8, U9, 0x10 vextrins.w U8, U10, 0x20 vextrins.w U8, U11, 0x30 - - vfmadd.s U4, U3, U1, U4 - vfmadd.s U8, U3, U14, U8 - + b .L01_Y_1 +.L01_Y_0: + add.d T3, IY, INCY + vldx U4, Y, T3 + alsl.d T4, INCY, T3, 2 + vldx U8, Y, T4 +.L01_Y_1: +.endm + +.macro STORE_Y_8 + beqz T5, .L01_Y_2 vextrins.w U5, U4, 0x01 vextrins.w U6, U4, 0x02 vextrins.w U7, U4, 0x03 @@ -211,10 +152,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstx.s $f10, Y, T2 add.d T2, T2, INCY fstx.s $f11, Y, T2 - - slli.d T2, INCY, 3 - add.d IY, IY, T2 - + b .L01_Y_3 +.L01_Y_2: + vstx U4, Y, T3 + vstx U8, Y, T4 +.L01_Y_3: +.endm + +.macro LOAD_X_8 + beqz T6, .L01_X_0 add.d T2, IX, INCX fldx.s $f4, X, T2 add.d T2, T2, INCX @@ -239,31 +185,109 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vextrins.w $vr8, $vr9, 0x10 vextrins.w $vr8, $vr10, 0x20 vextrins.w $vr8, $vr11, 0x30 + b .L01_X_1 +.L01_X_0: + add.d T3, IX, INCX + vldx U4, X, T3 + alsl.d T4, INCX, T3, 2 + vldx U8, X, T4 +.L01_X_1: +.endm - vand.v $vr12, $vr2, $vr2 + PROLOGUE - vfmadd.s U2, U1, U4, U2 - vfsub.s U2, U2, $vr12 - vfmadd.s U2, U14, U8, U2 + addi.d $sp, $sp, -88 - vextrins.w U4, U2, 0x01 - vextrins.w U5, U2, 0x02 - vextrins.w U6, U2, 0x03 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + SDARG $r31, $sp, 72 + ST ALPHA, $sp, 80 - fadd.s $f2, $f2, $f4 - fadd.s $f2, $f2, $f5 - fadd.s $f2, $f2, $f6 - fadd.s $f2, $f2, $f12 + vldrepl.w VALPHA, $sp, 80 - vpermi.w U2, U2, 0x00 + addi.d T5, INCY, -1 + addi.d T6, INCX, -1 + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + move J, $r0 + move JY, $r0 + move JX, $r0 + move AO1, A + + beq J, N, .L999 + +.L01: + vxor.v U2, U2, U2 + fldx.s a6, X, JX + fmul.s a3, ALPHA, a6 //temp1 + vpermi.w U3, U3, 0x00 + + mul.w T0, J, LDA + slli.d T1, J, BASE_SHIFT + add.w T0, T0, T1 + fldx.s a6, AO1, T0 + fldx.s a4, Y, JY + fmadd.s a4, a3, a6, a4 + fstx.s a4, Y, JY + + move IY, JY + move IX, JX + addi.d II, J, 1 + move I, II + slli.d II, II, BASE_SHIFT - slli.d T2, INCX, 3 - add.d IX, IX, T2 + sub.d T0, M, J + addi.d T0, T0, -1 + srai.d T0, T0, 3 + add.d T0, T0, J + addi.d T0, T0, 1 + beq I, T0, .L03 + bge I, T0, .L03 + + mul.w T1, J, LDA + add.d T1, T1, II + +.L02: /* /8 */ + vldx U1, AO1, T1 + addi.d T1, T1, 16 + vldx U14, AO1, T1 + addi.d T1, T1, 16 + + LOAD_Y_8 + + vfmadd.s U4, U3, U1, U4 + vfmadd.s U8, U3, U14, U8 + + STORE_Y_8 + + alsl.d IY, INCY, IY, 3 + + LOAD_X_8 + + vfmadd.s U2, U1, U4, U2 + vfmadd.s U2, U14, U8, U2 + + alsl.d IX, INCX, IX, 3 addi.d II, II, 32 addi.d I, I, 1 blt I, T0, .L02 + // Acc U2 + GACC vf, s, U4, U2 + vpermi.w U2, U4, 0 + .L03: /* &4 */ sub.d T0, M, J addi.d T0, T0, -1 @@ -426,4 +450,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d $sp, $sp, 88 jirl $r0, $r1, 0x0 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/ssymv_U_lsx.S b/kernel/loongarch64/ssymv_U_lsx.S index f3898e148..7ff9b9b7b 100644 --- a/kernel/loongarch64/ssymv_U_lsx.S +++ b/kernel/loongarch64/ssymv_U_lsx.S @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ASSEMBLER #include "common.h" +#include "loongarch64_asm.S" /* Param */ #define M $r4 @@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define T2 $r28 #define T3 $r29 #define T4 $r30 +#define T5 $r17 +#define T6 $r16 /* LSX vectors */ #define U0 $vr31 @@ -87,67 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define a8 $f8 #define a9 $f9 - - PROLOGUE - - LDARG BUFFER, $sp, 0 - - addi.d $sp, $sp, -88 - - SDARG $r23, $sp, 0 - SDARG $r24, $sp, 8 - SDARG $r25, $sp, 16 - SDARG $r26, $sp, 32 - SDARG $r27, $sp, 40 - SDARG $r28, $sp, 48 - SDARG $r29, $sp, 56 - SDARG $r30, $sp, 64 - SDARG $r31, $sp, 72 - ST ALPHA, $sp, 80 - - vldrepl.w VALPHA, $sp, 80 - - slli.d LDA, LDA, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - - bge $r0, M, .L999 - bge $r0, N, .L999 - - sub.d M1, M, N - - mul.d JY, M1, INCY - mul.d JX, M1, INCX - - move J, M1 - move AO1, A - - beq J, M, .L999 - -.L01: - MTC $f2, $r0 //temp2 - fldx.s $f6, X, JX - fmul.s $f3, ALPHA, $f6 //temp1 - vpermi.w U3, U3, 0x00 - vpermi.w U2, U2, 0x00 - - move IY, $r0 - move IX, $r0 - move II, $r0 - move I, $r0 - - srai.d T0, J, 3 - beq I, T0, .L03 - - mul.w T1, J, LDA - add.d T1, T1, II - -.L02: /* /8 */ - vldx U1, AO1, T1 - addi.d T1, T1, 16 - vldx U14, AO1, T1 - addi.d T1, T1, 16 - +.macro LOAD_Y_8 + beqz T5, .L01_Y_0 fldx.s $f4, Y, IY add.d T2, IY, INCY fldx.s $f5, Y, T2 @@ -171,10 +115,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vextrins.w U8, U9, 0x10 vextrins.w U8, U10, 0x20 vextrins.w U8, U11, 0x30 - - vfmadd.s U4, U3, U1, U4 - vfmadd.s U8, U3, U14, U8 - + b .L01_Y_1 +.L01_Y_0: + vldx U4, Y, IY + alsl.d T2, INCY, IY, 2 + vldx U8, Y, T2 +.L01_Y_1: +.endm + +.macro STORE_Y_8 + beqz T5, .L01_Y_2 vextrins.w U5, U4, 0x01 vextrins.w U6, U4, 0x02 vextrins.w U7, U4, 0x03 @@ -198,10 +148,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstx.s $f10, Y, T2 add.d T2, T2, INCY fstx.s $f11, Y, T2 - - slli.d T2, INCY, 3 - add.d IY, IY, T2 - + b .L01_Y_3 +.L01_Y_2: + vstx U4, Y, IY + vstx U8, Y, T2 +.L01_Y_3: +.endm + +.macro LOAD_X_8 + beqz T6, .L01_X_0 fldx.s $f4, X, IX add.d T2, IX, INCX fldx.s $f5, X, T2 @@ -225,31 +180,97 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vextrins.w $vr8, $vr9, 0x10 vextrins.w $vr8, $vr10, 0x20 vextrins.w $vr8, $vr11, 0x30 + b .L01_X_1 +.L01_X_0: + vldx U4, X, IX + alsl.d T3, INCX, IX, 2 + vldx U8, X, T3 +.L01_X_1: +.endm - vand.v $vr12, $vr2, $vr2 + PROLOGUE - vfmadd.s U2, U1, U4, U2 - vfsub.s U2, U2, $vr12 - vfmadd.s U2, U14, U8, U2 + addi.d $sp, $sp, -88 - vextrins.w U4, U2, 0x01 - vextrins.w U5, U2, 0x02 - vextrins.w U6, U2, 0x03 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + SDARG $r31, $sp, 72 + ST ALPHA, $sp, 80 - fadd.s $f2, $f2, $f4 - fadd.s $f2, $f2, $f5 - fadd.s $f2, $f2, $f6 - fadd.s $f2, $f2, $f12 + vldrepl.w VALPHA, $sp, 80 - vpermi.w U2, U2, 0x00 + addi.d T5, INCY, -1 + addi.d T6, INCX, -1 + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + sub.d M1, M, N + + mul.d JY, M1, INCY + mul.d JX, M1, INCX + + move J, M1 + move AO1, A + + beq J, M, .L999 + +.L01: + vxor.v U2, U2, U2 + fldx.s $f6, X, JX + fmul.s $f3, ALPHA, $f6 //temp1 + vpermi.w U3, U3, 0x00 + + move IY, $r0 + move IX, $r0 + move II, $r0 + move I, $r0 + + srai.d T0, J, 3 + beq I, T0, .L03 + + mul.w T1, J, LDA + add.d T1, T1, II - slli.d T2, INCX, 3 - add.d IX, IX, T2 +.L02: /* /8 */ + vldx U1, AO1, T1 + addi.d T1, T1, 16 + vldx U14, AO1, T1 + addi.d T1, T1, 16 + + LOAD_Y_8 + + vfmadd.s U4, U3, U1, U4 + vfmadd.s U8, U3, U14, U8 + + STORE_Y_8 + + alsl.d IY, INCY, IY, 3 + + LOAD_X_8 + + vfmadd.s U2, U1, U4, U2 + vfmadd.s U2, U14, U8, U2 + + alsl.d IX, INCX, IX, 3 addi.d II, II, 32 addi.d I, I, 1 blt I, T0, .L02 + // Acc U2 + GACC vf, s, U4, U2 + vpermi.w U2, U4, 0x00 + .L03: /* &4 */ andi T0, J, 4 beq $r0, T0, .L04 @@ -414,4 +435,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d $sp, $sp, 88 jirl $r0, $r1, 0x0 - EPILOGUE \ No newline at end of file + EPILOGUE From e0a8216554f22529d264ff921fc245255a580447 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 14 Jan 2025 10:25:08 +0000 Subject: [PATCH 007/205] LoongArch64: Update dsymv LSX version --- kernel/loongarch64/dsymv_L_lsx.S | 208 ++++++++++++++++++------------- kernel/loongarch64/dsymv_U_lsx.S | 200 +++++++++++++++++------------ 2 files changed, 241 insertions(+), 167 deletions(-) diff --git a/kernel/loongarch64/dsymv_L_lsx.S b/kernel/loongarch64/dsymv_L_lsx.S index 1fd0d26f5..fed408108 100644 --- a/kernel/loongarch64/dsymv_L_lsx.S +++ b/kernel/loongarch64/dsymv_L_lsx.S @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ASSEMBLER #include "common.h" +#include "loongarch64_asm.S" /* Param */ #define M $r4 @@ -57,6 +58,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define T2 $r28 #define T3 $r29 #define T4 $r30 +#define T5 $r17 +#define T6 $r16 +#define T7 $r12 /* LSX vectors */ #define U0 $vr31 @@ -87,10 +91,114 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define a8 $f8 #define a9 $f9 +.macro LOAD_Y_8 + beqz T5, .L01_Y_0 + add.d T2, IY, INCY + fldx.d $f4, Y, T2 + add.d T2, T2, INCY + fldx.d $f5, Y, T2 + add.d T2, T2, INCY + fldx.d $f6, Y, T2 + add.d T2, T2, INCY + fldx.d $f7, Y, T2 - PROLOGUE + add.d T2, T2, INCY + fldx.d $f8, Y, T2 + add.d T2, T2, INCY + fldx.d $f9, Y, T2 + add.d T2, T2, INCY + fldx.d $f10, Y, T2 + add.d T2, T2, INCY + fldx.d $f11, Y, T2 + + vextrins.d U4, U5, 0x10 + vextrins.d U6, U7, 0x10 + vextrins.d U8, U9, 0x10 + vextrins.d U10, U11, 0x10 + b .L01_Y_1 +.L01_Y_0: + add.d T7, IY, INCY + vldx U4, Y, T7 + alsl.d T2, INCY, T7, 1 + vldx U6, Y, T2 + alsl.d T3, INCY, T2, 1 + vldx U8, Y, T3 + alsl.d T4, INCY, T3, 1 + vldx U10, Y, T4 +.L01_Y_1: +.endm + +.macro LOAD_X_8 + beqz T6, .L01_X_0 + add.d T2, IX, INCX + fldx.d $f4, X, T2 + add.d T2, T2, INCX + fldx.d $f5, X, T2 + add.d T2, T2, INCX + fldx.d $f6, X, T2 + add.d T2, T2, INCX + fldx.d $f7, X, T2 + + add.d T2, T2, INCX + fldx.d $f8, X, T2 + add.d T2, T2, INCX + fldx.d $f9, X, T2 + add.d T2, T2, INCX + fldx.d $f10, X, T2 + add.d T2, T2, INCX + fldx.d $f11, X, T2 + + vextrins.d U4, U5, 0x10 + vextrins.d U6, U7, 0x10 + vextrins.d U8, U9, 0x10 + vextrins.d U10, U11, 0x10 + b .L01_X_1 +.L01_X_0: + add.d T7, IX, INCX + vldx U4, X, T7 + alsl.d T2, INCX, T7, 1 + vldx U6, X, T2 + alsl.d T3, INCX, T2, 1 + vldx U8, X, T3 + alsl.d T4, INCX, T3, 1 + vldx U10, X, T4 +.L01_X_1: +.endm + +.macro STORE_Y_8 + beqz T5, .L01_Y_2 + vextrins.d U5, U4, 0x01 + vextrins.d U7, U6, 0x01 + vextrins.d U9, U8, 0x01 + vextrins.d U11, U10, 0x01 + + add.d T2, IY, INCY + fstx.d $f4, Y, T2 + add.d T2, T2, INCY + fstx.d $f5, Y, T2 + add.d T2, T2, INCY + fstx.d $f6, Y, T2 + add.d T2, T2, INCY + fstx.d $f7, Y, T2 + + add.d T2, T2, INCY + fstx.d $f8, Y, T2 + add.d T2, T2, INCY + fstx.d $f9, Y, T2 + add.d T2, T2, INCY + fstx.d $f10, Y, T2 + add.d T2, T2, INCY + fstx.d $f11, Y, T2 + b .L01_Y_3 +.L01_Y_2: + vstx U4, Y, T7 + vstx U6, Y, T2 + vstx U8, Y, T3 + vstx U10, Y, T4 +.L01_Y_3: +.endm - LDARG BUFFER, $sp, 0 + PROLOGUE addi.d $sp, $sp, -88 @@ -107,6 +215,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vldrepl.d VALPHA, $sp, 80 + addi.d T5, INCY, -1 + addi.d T6, INCX, -1 slli.d LDA, LDA, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT @@ -122,11 +232,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. beq J, N, .L999 .L01: - MTC a2, $r0 //temp2 + vxor.v U2, U2, U2 fldx.d a6, X, JX fmul.d a3, ALPHA, a6 //temp1 vshuf4i.d U3, U3, 0x00 - vshuf4i.d U2, U2, 0x00 mul.d T0, J, LDA slli.d T1, J, BASE_SHIFT @@ -163,105 +272,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vldx U16, AO1, T1 addi.d T1, T1, 16 - add.d T2, IY, INCY - fldx.d $f4, Y, T2 - add.d T2, T2, INCY - fldx.d $f5, Y, T2 - add.d T2, T2, INCY - fldx.d $f6, Y, T2 - add.d T2, T2, INCY - fldx.d $f7, Y, T2 - - add.d T2, T2, INCY - fldx.d $f8, Y, T2 - add.d T2, T2, INCY - fldx.d $f9, Y, T2 - add.d T2, T2, INCY - fldx.d $f10, Y, T2 - add.d T2, T2, INCY - fldx.d $f11, Y, T2 - - vextrins.d U4, U5, 0x10 - vextrins.d U6, U7, 0x10 - vextrins.d U8, U9, 0x10 - vextrins.d U10, U11, 0x10 + LOAD_Y_8 vfmadd.d U4, U3, U1, U4 vfmadd.d U6, U3, U14, U6 vfmadd.d U8, U3, U15, U8 vfmadd.d U10, U3, U16, U10 - vextrins.d U5, U4, 0x01 - vextrins.d U7, U6, 0x01 - vextrins.d U9, U8, 0x01 - vextrins.d U11, U10, 0x01 - - add.d T2, IY, INCY - fstx.d $f4, Y, T2 - add.d T2, T2, INCY - fstx.d $f5, Y, T2 - add.d T2, T2, INCY - fstx.d $f6, Y, T2 - add.d T2, T2, INCY - fstx.d $f7, Y, T2 - - add.d T2, T2, INCY - fstx.d $f8, Y, T2 - add.d T2, T2, INCY - fstx.d $f9, Y, T2 - add.d T2, T2, INCY - fstx.d $f10, Y, T2 - add.d T2, T2, INCY - fstx.d $f11, Y, T2 - - slli.d T2, INCY, 3 - add.d IY, IY, T2 - - add.d T2, IX, INCX - fldx.d $f4, X, T2 - add.d T2, T2, INCX - fldx.d $f5, X, T2 - add.d T2, T2, INCX - fldx.d $f6, X, T2 - add.d T2, T2, INCX - fldx.d $f7, X, T2 - - add.d T2, T2, INCX - fldx.d $f8, X, T2 - add.d T2, T2, INCX - fldx.d $f9, X, T2 - add.d T2, T2, INCX - fldx.d $f10, X, T2 - add.d T2, T2, INCX - fldx.d $f11, X, T2 + STORE_Y_8 - vextrins.d U4, U5, 0x10 - vextrins.d U6, U7, 0x10 - vextrins.d U8, U9, 0x10 - vextrins.d U10, U11, 0x10 + alsl.d IY, INCY, IY, 3 - vand.v $vr12, $vr2, $vr2 + LOAD_X_8 vfmadd.d U2, U1, U4, U2 - vfsub.d U2, U2, $vr12 vfmadd.d U2, U14, U6, U2 vfmadd.d U2, U15, U8, U2 vfmadd.d U2, U16, U10, U2 - vextrins.d U4, U2, 0x01 - - fadd.d $f2, $f2, $f4 - fadd.d $f2, $f2, $f12 - - vextrins.d U2, U2, 0x10 - - slli.d T2, INCX, 3 - add.d IX, IX, T2 + alsl.d IX, INCX, IX, 3 addi.d II, II, 64 addi.d I, I, 1 blt I, T0, .L02 + // Acc U2 + GACC vf, d, U4, U2 + vilvl.d U2, U4, U4 + .L03: /* &4 */ sub.d T0, M, J addi.d T0, T0, -1 @@ -429,4 +467,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d $sp, $sp, 88 jirl $r0, $r1, 0x0 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/dsymv_U_lsx.S b/kernel/loongarch64/dsymv_U_lsx.S index f708196aa..2589f3191 100644 --- a/kernel/loongarch64/dsymv_U_lsx.S +++ b/kernel/loongarch64/dsymv_U_lsx.S @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ASSEMBLER #include "common.h" +#include "loongarch64_asm.S" /* Param */ #define M $r4 @@ -57,6 +58,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define T2 $r28 #define T3 $r29 #define T4 $r30 +#define T5 $r17 +#define T6 $r16 +#define T7 $r12 /* LSX vectors */ #define U0 $vr31 @@ -87,10 +91,109 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define a8 $f8 #define a9 $f9 +.macro LOAD_Y_8 + beqz T5, .L01_Y_0 + fldx.d $f4, Y, IY + add.d T2, IY, INCY + fldx.d $f5, Y, T2 + add.d T2, T2, INCY + fldx.d $f6, Y, T2 + add.d T2, T2, INCY + fldx.d $f7, Y, T2 - PROLOGUE + add.d T2, T2, INCY + fldx.d $f8, Y, T2 + add.d T2, T2, INCY + fldx.d $f9, Y, T2 + add.d T2, T2, INCY + fldx.d $f10, Y, T2 + add.d T2, T2, INCY + fldx.d $f11, Y, T2 - LDARG BUFFER, $sp, 0 + vextrins.d U4, U5, 0x10 + vextrins.d U6, U7, 0x10 + vextrins.d U8, U9, 0x10 + vextrins.d U10, U11, 0x10 + b .L01_Y_1 +.L01_Y_0: + vldx U4, Y, IY + alsl.d T2, INCY, IY, 1 + vldx U6, Y, T2 + alsl.d T3, INCY, T2, 1 + vldx U8, Y, T3 + alsl.d T4, INCY, T3, 1 + vldx U10, Y, T4 +.L01_Y_1: +.endm + +.macro STORE_Y_8 + beqz T5, .L01_Y_2 + vextrins.d U5, U4, 0x01 + vextrins.d U7, U6, 0x01 + vextrins.d U9, U8, 0x01 + vextrins.d U11, U10, 0x01 + + fstx.d $f4, Y, IY + add.d T2, IY, INCY + fstx.d $f5, Y, T2 + add.d T2, T2, INCY + fstx.d $f6, Y, T2 + add.d T2, T2, INCY + fstx.d $f7, Y, T2 + + add.d T2, T2, INCY + fstx.d $f8, Y, T2 + add.d T2, T2, INCY + fstx.d $f9, Y, T2 + add.d T2, T2, INCY + fstx.d $f10, Y, T2 + add.d T2, T2, INCY + fstx.d $f11, Y, T2 + b .L01_Y_3 +.L01_Y_2: + vstx U4, Y, IY + vstx U6, Y, T2 + vstx U8, Y, T3 + vstx U10,Y, T4 +.L01_Y_3: +.endm + +.macro LOAD_X_8 + beqz T6, .L01_X_0 + fldx.d $f4, X, IX + add.d T2, IX, INCX + fldx.d $f5, X, T2 + add.d T2, T2, INCX + fldx.d $f6, X, T2 + add.d T2, T2, INCX + fldx.d $f7, X, T2 + + add.d T2, T2, INCX + fldx.d $f8, X, T2 + add.d T2, T2, INCX + fldx.d $f9, X, T2 + add.d T2, T2, INCX + fldx.d $f10, X, T2 + add.d T2, T2, INCX + fldx.d $f11, X, T2 + + vextrins.d U4, U5, 0x10 + vextrins.d U6, U7, 0x10 + vextrins.d U8, U9, 0x10 + vextrins.d U10, U11, 0x10 + b .L01_X_1 +.L01_X_0: + vldx U4, X, IX + alsl.d T2, INCX, IX, 1 + vldx U6, X, T2 + alsl.d T3, INCX, T2, 1 + vldx U8, X, T3 + alsl.d T4, INCX, T3, 1 + vldx U10, X, T4 +.L01_X_1: +.endm + + PROLOGUE addi.d $sp, $sp, -88 @@ -107,6 +210,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vldrepl.d VALPHA, $sp, 80 + addi.d T5, INCY, -1 + addi.d T6, INCX, -1 slli.d LDA, LDA, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT @@ -125,11 +230,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. beq J, M, .L999 .L01: - MTC $f2, $r0 //temp2 + vxor.v U2, U2, U2 fldx.d $f6, X, JX fmul.d $f3, ALPHA, $f6 //temp1 vshuf4i.d U3, U3, 0x00 - vshuf4i.d U2, U2, 0x00 move IY, $r0 move IX, $r0 @@ -152,102 +256,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vldx U16, AO1, T1 addi.d T1, T1, 16 - fldx.d $f4, Y, IY - add.d T2, IY, INCY - fldx.d $f5, Y, T2 - add.d T2, T2, INCY - fldx.d $f6, Y, T2 - add.d T2, T2, INCY - fldx.d $f7, Y, T2 - - add.d T2, T2, INCY - fldx.d $f8, Y, T2 - add.d T2, T2, INCY - fldx.d $f9, Y, T2 - add.d T2, T2, INCY - fldx.d $f10, Y, T2 - add.d T2, T2, INCY - fldx.d $f11, Y, T2 - - vextrins.d U4, U5, 0x10 - vextrins.d U6, U7, 0x10 - vextrins.d U8, U9, 0x10 - vextrins.d U10, U11, 0x10 + LOAD_Y_8 vfmadd.d U4, U3, U1, U4 vfmadd.d U6, U3, U14, U6 vfmadd.d U8, U3, U15, U8 vfmadd.d U10, U3, U16, U10 - vextrins.d U5, U4, 0x01 - vextrins.d U7, U6, 0x01 - vextrins.d U9, U8, 0x01 - vextrins.d U11, U10, 0x01 + STORE_Y_8 - fstx.d $f4, Y, IY - add.d T2, IY, INCY - fstx.d $f5, Y, T2 - add.d T2, T2, INCY - fstx.d $f6, Y, T2 - add.d T2, T2, INCY - fstx.d $f7, Y, T2 + alsl.d IY, INCY, IY, 3 - add.d T2, T2, INCY - fstx.d $f8, Y, T2 - add.d T2, T2, INCY - fstx.d $f9, Y, T2 - add.d T2, T2, INCY - fstx.d $f10, Y, T2 - add.d T2, T2, INCY - fstx.d $f11, Y, T2 - - slli.d T2, INCY, 3 - add.d IY, IY, T2 - - fldx.d $f4, X, IX - add.d T2, IX, INCX - fldx.d $f5, X, T2 - add.d T2, T2, INCX - fldx.d $f6, X, T2 - add.d T2, T2, INCX - fldx.d $f7, X, T2 - - add.d T2, T2, INCX - fldx.d $f8, X, T2 - add.d T2, T2, INCX - fldx.d $f9, X, T2 - add.d T2, T2, INCX - fldx.d $f10, X, T2 - add.d T2, T2, INCX - fldx.d $f11, X, T2 - - vextrins.d U4, U5, 0x10 - vextrins.d U6, U7, 0x10 - vextrins.d U8, U9, 0x10 - vextrins.d U10, U11, 0x10 - - vand.v $vr12, $vr2, $vr2 + LOAD_X_8 vfmadd.d U2, U1, U4, U2 - vfsub.d U2, U2, $vr12 vfmadd.d U2, U14, U6, U2 vfmadd.d U2, U15, U8, U2 vfmadd.d U2, U16, U10, U2 - vextrins.d U4, U2, 0x01 - - fadd.d $f2, $f2, $f4 - fadd.d $f2, $f2, $f12 - - vextrins.d U2, U2, 0x10 - - slli.d T2, INCX, 3 - add.d IX, IX, T2 + alsl.d IX, INCX, IX, 3 addi.d II, II, 64 addi.d I, I, 1 blt I, T0, .L02 + // Acc U2 + GACC vf, d, U4, U2 + vilvl.d U2, U4, U4 + .L03: /* &4 */ andi T0, J, 4 beq $r0, T0, .L04 @@ -417,4 +453,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d $sp, $sp, 88 jirl $r0, $r1, 0x0 - EPILOGUE \ No newline at end of file + EPILOGUE From ef7f54b35713a315671f64f03fe0d417f1bb0360 Mon Sep 17 00:00:00 2001 From: "tingbo.liao" Date: Wed, 15 Jan 2025 11:13:21 +0800 Subject: [PATCH 008/205] Optimized the gemm_tcopy_8_rvv to be compatible with the vlens 128 and 256. Signed-off-by: tingbo.liao --- kernel/riscv64/gemm_tcopy_8_rvv.c | 214 ++++-------------------------- 1 file changed, 25 insertions(+), 189 deletions(-) diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c index 4742ae6a7..c50b0d5b4 100644 --- a/kernel/riscv64/gemm_tcopy_8_rvv.c +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -28,35 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define FLOAT_VX2_T vfloat32m1x2_t -#define FLOAT_VX4_T vfloat32m1x4_t -#define FLOAT_VX8_T vfloat32m1x8_t -#define VLEV_FLOAT __riscv_vle32_v_f32m1 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m1 -#define VSEV_FLOAT __riscv_vse32_v_f32m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 -#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 -#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_HALF vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLEV_FLOAT_HALF __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_HALF __riscv_vse32_v_f32m1 #else -#define VSETVL(n) __riscv_vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define FLOAT_VX2_T vfloat64m1x2_t -#define FLOAT_VX4_T vfloat64m1x4_t -#define FLOAT_VX8_T vfloat64m1x8_t -#define VLEV_FLOAT __riscv_vle64_v_f64m1 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m1 -#define VSEV_FLOAT __riscv_vse64_v_f64m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 -#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 -#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_HALF vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLEV_FLOAT_HALF __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSEV_FLOAT_HALF __riscv_vse64_v_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) @@ -69,9 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; FLOAT_V_T v0; - FLOAT_VX2_T vx2; - FLOAT_VX4_T vx4; - FLOAT_VX8_T vx8; + FLOAT_V_T_HALF v1; // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); @@ -81,156 +63,12 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) boffset3 = b + m * (n & ~3); boffset4 = b + m * (n & ~1); - for(j = (m >> 3); j > 0; j--) { - - aoffset1 = aoffset; - aoffset += 8 * lda; - - boffset1 = boffset; - boffset += 64; - - for(i = (n >> 3); i > 0; i--) { - size_t vl = 8; - - vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, vx8, vl); - - aoffset1 += 8; - boffset1 += m * 8; - } - - if (n & 4) { - size_t vl = 8; - - vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, vx4, vl); - - aoffset1 += 4; - boffset2 += 32; - } - - if (n & 2) { - size_t vl = 8; - - vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, vx2, vl); - - aoffset1 += 2; - boffset3 += 16; - } - - if (n & 1) { - size_t vl = 8; - - v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSEV_FLOAT(boffset4, v0, vl); - - aoffset1 += 1; - boffset4 += 8; - } - - } - - if (m & 4) { - - aoffset1 = aoffset; - aoffset += 4 * lda; - - boffset1 = boffset; - boffset += 32; - - for(i = (n >> 3); i > 0; i--) { - size_t vl = 4; - - vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, vx8, vl); - - aoffset1 += 8; - boffset1 += m * 8; - } - - if (n & 4) { - size_t vl = 4; - - vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, vx4, 
vl); - - aoffset1 += 4; - boffset2 += 16; - } - - if (n & 2) { - size_t vl = 4; - - vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, vx2, vl); - - aoffset1 += 2; - boffset3 += 8; - } - - if (n & 1) { - size_t vl = 4; - - v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSEV_FLOAT(boffset4, v0, vl); - - aoffset1 += 1; - boffset4 += 4; - } - } - - if (m & 2) { + for(j = m; j > 0; j--) { aoffset1 = aoffset; - aoffset += 2 * lda; - boffset1 = boffset; - boffset += 16; - - for(i = (n >> 3); i > 0; i--) { - size_t vl = 2; - vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, vx8, vl); - - aoffset1 += 8; - boffset1 += m * 8; - } - - if (n & 4) { - size_t vl = 2; - - vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, vx4, vl); - - aoffset1 += 4; - boffset2 += 8; - } - - if (n & 2) { - size_t vl = 2; - - vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, vx2, vl); - - aoffset1 += 2; - boffset3 += 4; - } - - if (n & 1) { - size_t vl = 2; - - v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSEV_FLOAT(boffset4, v0, vl); - - aoffset1 += 1; - boffset4 += 2; - } - } - - if (m & 1) { - aoffset1 = aoffset; - boffset1 = boffset; + aoffset += lda; + boffset += 8; for(i = (n >> 3); i > 0; i--) { size_t vl = 8; @@ -245,27 +83,25 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 4) { size_t vl = 4; - v0 = VLEV_FLOAT(aoffset1, vl); - VSEV_FLOAT(boffset2, v0, vl); + v1 = VLEV_FLOAT_HALF(aoffset1, vl); + VSEV_FLOAT_HALF(boffset2, v1, vl); aoffset1 += 4; - //boffset2 += 4; + boffset2 += 4; } if (n & 2) { - size_t vl = 2; - - v0 = VLEV_FLOAT(aoffset1, vl); - VSEV_FLOAT(boffset3, v0, vl); + *(boffset3) = *(aoffset1); + *(boffset3 + 1) = *(aoffset1 + 1); aoffset1 += 2; - // boffset3 += 2; + boffset3 += 2; } if (n & 1) { - *(boffset4) = *(aoffset1); - // aoffset1 ++; - // boffset4 ++; + *(boffset4) = *(aoffset1); + aoffset1 ++; + boffset4 ++; } } From 7c3a920a815f2cb890618300cce8aa46d513cf95 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 16 Jan 2025 20:48:30 +0100 Subject: [PATCH 009/205] CI: Update ubuntu-latest runners to fix side effects of switch to 24.04 (#5079) --- .github/workflows/c910v.yml | 3 ++- .github/workflows/codspeed-bench.yml | 4 ++-- .github/workflows/dynamic_arch.yml | 4 +++- .github/workflows/loongarch64_clang.yml | 2 +- .github/workflows/mips64.yml | 7 +++---- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml index a47ca1dce..1dd3a2c71 100644 --- a/.github/workflows/c910v.yml +++ b/.github/workflows/c910v.yml @@ -37,7 +37,7 @@ jobs: run: | sudo apt-get update sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ - gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross + gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev - name: checkout qemu uses: actions/checkout@v3 @@ -52,6 +52,7 @@ jobs: wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch cd qemu patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch + export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error" ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system make -j$(nproc) make install diff --git a/.github/workflows/codspeed-bench.yml b/.github/workflows/codspeed-bench.yml index 25e196ef2..94e0d708e 
100644 --- a/.github/workflows/codspeed-bench.yml +++ b/.github/workflows/codspeed-bench.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest] + os: [ubuntu-22.04] fortran: [gfortran] build: [make] pyver: ["3.12"] @@ -147,7 +147,7 @@ jobs: OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py -k 'gesdd' - name: Run benchmarks - uses: CodSpeedHQ/action@v2 + uses: CodSpeedHQ/action@v3 with: token: ${{ secrets.CODSPEED_TOKEN }} run: | diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 9e55e7346..f42d4c57f 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -43,7 +43,9 @@ jobs: run: | if [ "$RUNNER_OS" == "Linux" ]; then sudo apt-get update - sudo apt-get install -y gfortran cmake ccache libtinfo5 + sudo apt-get install -y gfortran cmake ccache + wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb + sudo apt install ./libtinfo5_6.3-2ubuntu0.1_amd64.deb elif [ "$RUNNER_OS" == "macOS" ]; then # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. brew reinstall gcc diff --git a/.github/workflows/loongarch64_clang.yml b/.github/workflows/loongarch64_clang.yml index f1a75ad34..fdb48309b 100644 --- a/.github/workflows/loongarch64_clang.yml +++ b/.github/workflows/loongarch64_clang.yml @@ -41,7 +41,7 @@ jobs: - name: Install APT deps run: | sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build make ccache + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache libglib2.0-dev - name: Download and install loongarch64-toolchain run: | diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml index 1491aff78..56da22c6b 100644 --- a/.github/workflows/mips64.yml +++ b/.github/workflows/mips64.yml @@ -41,14 +41,14 @@ jobs: run: | sudo apt-get update sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ - gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross + gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross libglib2.0-dev - name: checkout qemu uses: actions/checkout@v3 with: repository: qemu/qemu path: qemu - ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2 + ref: ae35f033b874c627d81d51070187fbf55f0bf1a7 - name: build qemu run: | @@ -59,8 +59,7 @@ jobs: - name: Compilation cache uses: actions/cache@v3 - with: - path: ~/.ccache + with: path: ~/.ccache key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} restore-keys: | ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} From 2954dc1a705eee9987d57a68e6c5129af26b8e8f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 17 Jan 2025 12:28:26 +0100 Subject: [PATCH 010/205] CI: Add NeoverseN2 build on the new Cobalt-100 (#5080) * Add NeoverseN2 build --- .github/workflows/dynamic_arch.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index f42d4c57f..b388cb1b2 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -356,3 +356,23 @@ jobs: - name: Build OpenBLAS run: | make -j$(nproc) HOSTCC="ccache gcc" CC="ccache ${{ matrix.triple }}-gcc" FC="ccache ${{ matrix.triple }}-gfortran" ARCH=${{ matrix.target }} ${{ matrix.opts }} + + neoverse_build: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: ubuntu-24.04-arm + + steps: + - name: Checkout 
repository + uses: actions/checkout@v3 + + - name: Install Dependencies + run: | + sudo apt-get update + sudo apt-get install -y gcc gfortran make + + - name: Build OpenBLAS + run: | + make -j${nproc} TARGET=NEOVERSEN2 + make -j${nproc} TARGET=NEOVERSEN2 lapack-test + + From 87083fdbf64579410368367ba9e83c22a1455ae1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jan 2025 16:45:56 +0100 Subject: [PATCH 011/205] [WIP] Work around assembler limitations in current LLVM for Windows on Arm (#5076) * Protect align directives in assembly files that are currently problematic with LLVM on WoA * use the armv8 zdot on WoA to work around other LLVM issues --- kernel/arm64/KERNEL.NEOVERSEN1 | 10 + kernel/arm64/copy_thunderx2t99.c | 433 +++++++++++++++--------------- kernel/arm64/dasum_thunderx2t99.c | 2 + kernel/arm64/dot_kernel_asimd.c | 3 +- kernel/arm64/sasum_thunderx2t99.c | 3 +- kernel/arm64/zasum_thunderx2t99.c | 3 +- 6 files changed, 235 insertions(+), 219 deletions(-) diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 index 5b3174473..e623814d6 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN1 +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -98,8 +98,18 @@ ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.c SDOTKERNEL = dot.c +ifeq ($(OSNAME), WINNT) +ifeq ($(C_COMPILER), CLANG) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +endif +else CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c +endif DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S diff --git a/kernel/arm64/copy_thunderx2t99.c b/kernel/arm64/copy_thunderx2t99.c index e31876139..263cc3013 100644 --- a/kernel/arm64/copy_thunderx2t99.c +++ b/kernel/arm64/copy_thunderx2t99.c @@ -1,216 +1,217 @@ -/*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "common.h" - -#include -#define N "x0" /* vector length */ -#define X "x1" /* X vector address */ -#define INC_X "x2" /* X stride */ -#define Y "x3" /* Y vector address */ -#define INC_Y "x4" /* Y stride */ -#define J "x5" /* loop variable */ - -/******************************************************************************* -* Macro definitions -*******************************************************************************/ -#if !defined(COMPLEX) -#if !defined(DOUBLE) -#define TMPF "s0" -#define INC_SHIFT "2" -#define N_DIV_SHIFT "2" -#define N_REM_MASK "3" -#else -#define TMPF "d0" -#define INC_SHIFT "3" -#define N_DIV_SHIFT "1" -#define N_REM_MASK "1" -#endif -#else -#if !defined(DOUBLE) -#define TMPF "d0" -#define INC_SHIFT "3" -#define N_DIV_SHIFT "1" -#define N_REM_MASK "1" -#else -#define TMPF "q0" -#define INC_SHIFT "4" -#define N_DIV_SHIFT "0" -#define N_REM_MASK "0" -#endif -#endif - -#define KERNEL_F1 \ - "ldr "TMPF", ["X"] \n" \ - "add "X", "X", "INC_X" \n" \ - "str "TMPF", ["Y"] \n" \ - "add "Y", "Y", "INC_Y" \n" - -#define KERNEL_F \ - "ldr q0, ["X"], #16 \n" \ - "str q0, ["Y"], #16 \n" - -#define INIT \ - "lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \ - "lsl "INC_Y", "INC_Y", #"INC_SHIFT" \n" - - -static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - if ( n < 0 ) return 0; - - __asm__ __volatile__ ( - " mov "N", %[N_] \n" - " mov "X", %[X_] \n" - " mov "INC_X", %[INCX_] \n" - " mov "Y", %[Y_] \n" - " mov "INC_Y", %[INCY_] \n" - " cmp "N", xzr \n" - " ble 8f //copy_kernel_L999 \n" - " cmp "INC_X", #1 \n" - " bne 4f //copy_kernel_S_BEGIN \n" - " cmp "INC_Y", #1 \n" - " bne 4f //copy_kernel_S_BEGIN \n" - - "// .Lcopy_kernel_F_BEGIN: \n" - " "INIT" \n" - " asr "J", "N", #"N_DIV_SHIFT" \n" - " cmp "J", xzr \n" - " beq 2f //copy_kernel_F1 \n" - " .align 5 \n" - - "1: //copy_kernel_F: \n" - " "KERNEL_F" \n" - " subs "J", "J", #1 \n" - " bne 1b //copy_kernel_F \n" - - "2: //copy_kernel_F1: \n" -#if defined(COMPLEX) && defined(DOUBLE) - " b 8f //copy_kernel_L999 \n" -#else - " ands "J", "N", #"N_REM_MASK" \n" - " ble 8f //copy_kernel_L999 \n" -#endif - - "3: //copy_kernel_F10: \n" - " "KERNEL_F1" \n" - " subs "J", "J", #1 \n" - " bne 3b //copy_kernel_F10 \n" - " b 8f //copy_kernel_L999 \n" - - "4: //copy_kernel_S_BEGIN: \n" - " "INIT" \n" - " asr "J", "N", #2 \n" - " cmp "J", xzr \n" - " ble 6f //copy_kernel_S1 \n" - - "5: //copy_kernel_S4: \n" - " "KERNEL_F1" \n" - " "KERNEL_F1" \n" - " "KERNEL_F1" \n" - " "KERNEL_F1" \n" - " subs "J", "J", #1 \n" - " bne 5b //copy_kernel_S4 \n" - - "6: //copy_kernel_S1: \n" - " ands "J", "N", #3 \n" - " ble 8f //copy_kernel_L999 \n" - - "7: //copy_kernel_S10: \n" - " "KERNEL_F1" \n" - " subs "J", "J", #1 \n" - " bne 7b //copy_kernel_S10 \n" - - "8: //copy_kernel_L999: \n" - - : - : [N_] "r" (n), //%1 - [X_] "r" (x), //%2 - [INCX_] "r" (inc_x), //%3 - [Y_] "r" (y), //%4 - [INCY_] "r" (inc_y) //%5 - : "cc", - "memory", - "x0", "x1", "x2", "x3", "x4", "x5", - "d0" - ); - - return 0; -} - -#if defined(SMP) -static int copy_thread_function(BLASLONG n, BLASLONG dummy0, - BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, - BLASLONG inc_y, FLOAT *dummy3, BLASLONG dummy4) -{ - do_copy(n, x, inc_x, y, inc_y); - - return 0; -} -#endif - -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ -#if defined(SMP) - int nthreads; - FLOAT dummy_alpha; -#endif - - if (n <= 0) return 0; - -#if defined(SMP) - 
if (inc_x == 0 || n <= 10000) - nthreads = 1; - else - nthreads = num_cpu_avail(1); - - if (nthreads == 1) { - do_copy(n, x, inc_x, y, inc_y); - } else { - int mode = 0; - -#if !defined(COMPLEX) - mode = BLAS_REAL; -#else - mode = BLAS_COMPLEX; -#endif -#if !defined(DOUBLE) - mode |= BLAS_SINGLE; -#else - mode |= BLAS_DOUBLE; -#endif - - blas_level1_thread(mode, n, 0, 0, &dummy_alpha, - x, inc_x, y, inc_y, NULL, 0, - ( void *)copy_thread_function, nthreads); - } -#else - do_copy(n, x, inc_x, y, inc_y); -#endif - - return 0; -} +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#include +#define N "x0" /* vector length */ +#define X "x1" /* X vector address */ +#define INC_X "x2" /* X stride */ +#define Y "x3" /* Y vector address */ +#define INC_Y "x4" /* Y stride */ +#define J "x5" /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ +#if !defined(COMPLEX) +#if !defined(DOUBLE) +#define TMPF "s0" +#define INC_SHIFT "2" +#define N_DIV_SHIFT "2" +#define N_REM_MASK "3" +#else +#define TMPF "d0" +#define INC_SHIFT "3" +#define N_DIV_SHIFT "1" +#define N_REM_MASK "1" +#endif +#else +#if !defined(DOUBLE) +#define TMPF "d0" +#define INC_SHIFT "3" +#define N_DIV_SHIFT "1" +#define N_REM_MASK "1" +#else +#define TMPF "q0" +#define INC_SHIFT "4" +#define N_DIV_SHIFT "0" +#define N_REM_MASK "0" +#endif +#endif + +#define KERNEL_F1 \ + "ldr "TMPF", ["X"] \n" \ + "add "X", "X", "INC_X" \n" \ + "str "TMPF", ["Y"] \n" \ + "add "Y", "Y", "INC_Y" \n" + +#define KERNEL_F \ + "ldr q0, ["X"], #16 \n" \ + "str q0, ["Y"], #16 \n" + +#define INIT \ + "lsl "INC_X", "INC_X", #"INC_SHIFT" \n" \ + "lsl "INC_Y", "INC_Y", #"INC_SHIFT" \n" + + +static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + if ( n < 0 ) return 0; + + __asm__ __volatile__ ( + " mov "N", %[N_] \n" + " mov "X", %[X_] \n" + " mov "INC_X", %[INCX_] \n" + " mov "Y", %[Y_] \n" + " mov "INC_Y", %[INCY_] \n" + " cmp "N", xzr \n" + " ble 8f //copy_kernel_L999 \n" + " cmp "INC_X", #1 \n" + " bne 4f //copy_kernel_S_BEGIN \n" + " cmp "INC_Y", #1 \n" + " bne 4f //copy_kernel_S_BEGIN \n" + + "// .Lcopy_kernel_F_BEGIN: \n" + " "INIT" \n" + " asr "J", "N", #"N_DIV_SHIFT" \n" + " cmp "J", xzr \n" + " beq 2f //copy_kernel_F1 \n" +#if !(defined(__clang__) && defined(OS_WINDOWS)) + " .align 5 \n" +#endif + "1: //copy_kernel_F: \n" + " "KERNEL_F" \n" + " subs "J", "J", #1 \n" + " bne 1b //copy_kernel_F \n" + + "2: //copy_kernel_F1: \n" +#if defined(COMPLEX) && defined(DOUBLE) + " b 8f //copy_kernel_L999 \n" +#else + " ands "J", "N", #"N_REM_MASK" \n" + " ble 8f //copy_kernel_L999 \n" +#endif + + "3: //copy_kernel_F10: \n" + " "KERNEL_F1" \n" + " subs "J", "J", #1 \n" + " bne 3b //copy_kernel_F10 \n" + " b 8f //copy_kernel_L999 \n" + + "4: //copy_kernel_S_BEGIN: \n" + " "INIT" \n" + " asr "J", "N", #2 \n" + " cmp "J", xzr \n" + " ble 6f //copy_kernel_S1 \n" + + "5: //copy_kernel_S4: \n" + " "KERNEL_F1" \n" + " "KERNEL_F1" \n" + " "KERNEL_F1" \n" + " "KERNEL_F1" \n" + " subs "J", "J", #1 \n" + " bne 5b //copy_kernel_S4 \n" + + "6: //copy_kernel_S1: \n" + " ands "J", "N", #3 \n" + " ble 8f //copy_kernel_L999 \n" + + "7: //copy_kernel_S10: \n" + " "KERNEL_F1" \n" + " subs "J", "J", #1 \n" + " bne 7b //copy_kernel_S10 \n" + + "8: //copy_kernel_L999: \n" + + : + : [N_] "r" (n), //%1 + [X_] "r" (x), //%2 + [INCX_] "r" (inc_x), //%3 + [Y_] "r" (y), //%4 + [INCY_] "r" (inc_y) //%5 + : "cc", + "memory", + "x0", "x1", "x2", "x3", "x4", "x5", + "d0" + ); + + return 0; +} + +#if defined(SMP) +static int copy_thread_function(BLASLONG n, BLASLONG dummy0, + BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *dummy3, BLASLONG dummy4) +{ + do_copy(n, x, inc_x, y, inc_y); + + return 0; +} +#endif + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; 
+#endif + + if (n <= 0) return 0; + +#if defined(SMP) + if (inc_x == 0 || n <= 10000) + nthreads = 1; + else + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { + do_copy(n, x, inc_x, y, inc_y); + } else { + int mode = 0; + +#if !defined(COMPLEX) + mode = BLAS_REAL; +#else + mode = BLAS_COMPLEX; +#endif +#if !defined(DOUBLE) + mode |= BLAS_SINGLE; +#else + mode |= BLAS_DOUBLE; +#endif + + blas_level1_thread(mode, n, 0, 0, &dummy_alpha, + x, inc_x, y, inc_y, NULL, 0, + ( void *)copy_thread_function, nthreads); + } +#else + do_copy(n, x, inc_x, y, inc_y); +#endif + + return 0; +} diff --git a/kernel/arm64/dasum_thunderx2t99.c b/kernel/arm64/dasum_thunderx2t99.c index a212c9534..b554f0a9b 100644 --- a/kernel/arm64/dasum_thunderx2t99.c +++ b/kernel/arm64/dasum_thunderx2t99.c @@ -152,7 +152,9 @@ static FLOAT dasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " cmp "J", xzr \n" " beq 3f //asum_kernel_F1 \n" +#if !(defined(__clang__) && defined(OS_WINDOWS)) ".align 5 \n" +#endif "2: //asum_kernel_F32: \n" " "KERNEL_F32" \n" " subs "J", "J", #1 \n" diff --git a/kernel/arm64/dot_kernel_asimd.c b/kernel/arm64/dot_kernel_asimd.c index 1288838f8..a404c9636 100644 --- a/kernel/arm64/dot_kernel_asimd.c +++ b/kernel/arm64/dot_kernel_asimd.c @@ -285,8 +285,9 @@ static RETURN_TYPE dot_kernel_asimd(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT " asr %[J_], %[N_], #"N_DIV_SHIFT" \n" " cmp %[J_], xzr \n" " beq 3f //dot_kernel_F1 \n" - +#if !(defined(__clang__) && defined(OS_WINDOWS)) " .align 5 \n" +#endif "2: //dot_kernel_F: \n" " "KERNEL_F" \n" " subs %[J_], %[J_], #1 \n" diff --git a/kernel/arm64/sasum_thunderx2t99.c b/kernel/arm64/sasum_thunderx2t99.c index 014c667ba..2db1e69e7 100644 --- a/kernel/arm64/sasum_thunderx2t99.c +++ b/kernel/arm64/sasum_thunderx2t99.c @@ -153,8 +153,9 @@ static FLOAT sasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " asr "J", "N", #6 \n" " cmp "J", xzr \n" " beq 3f //asum_kernel_F1 \n" - +#if !(defined(__clang__) && defined(OS_WINDOWS)) ".align 5 \n" +#endif "2: //asum_kernel_F64: \n" " "KERNEL_F64" \n" " subs "J", "J", #1 \n" diff --git a/kernel/arm64/zasum_thunderx2t99.c b/kernel/arm64/zasum_thunderx2t99.c index 1d303a9a3..481357400 100644 --- a/kernel/arm64/zasum_thunderx2t99.c +++ b/kernel/arm64/zasum_thunderx2t99.c @@ -153,8 +153,9 @@ static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) " asr "J", "N", #4 \n" " cmp "J", xzr \n" " beq 3f //asum_kernel_F1 \n" - +#if !(defined(__clang__) && defined(OS_WINDOWS)) ".align 5 \n" +#endif "2: //asum_kernel_F16: \n" " "KERNEL_F16" \n" " subs "J", "J", #1 \n" From ca3e1c8f9c9ebe16f45cafbb4eb7c5dcb1a0a4a9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jan 2025 16:50:45 +0100 Subject: [PATCH 012/205] Get TARGET information from the registry on Windows --- cpuid_arm64.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 47e8ffcd6..20dbead23 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -43,6 +43,9 @@ size_t length64=sizeof(value64); #ifndef HWCAP_SVE #define HWCAP_SVE (1 << 22) #endif +#if (defined OS_WINDOWS) +#include +#endif #define get_cpu_ftr(id, var) ({ \ __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ @@ -385,6 +388,28 @@ int detect(void) if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 if (value64 == 1867590060) return CPU_VORTEX; //M4 +#else +#ifdef OS_WINDOWS + HKEY reghandle; + HKEY hklm = HKEY_LOCAL_MACHINE; + WCHAR valstring[512]; + PVOID pvalstring=valstring; + 
DWORD size=sizeof (valstring); + DWORD type=RRF_RT_ANY; + DWORD flags=0; + LPCWSTR subkey= L"HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"; + LPCWSTR field=L"ProcessorNameString"; + LONG errcode=RegOpenKeyEx(HKEY_LOCAL_MACHINE,TEXT("Hardware\\Description\\System\\CentralProcessor\\0"), 0, KEY_READ, ®handle); + if (errcode != NO_ERROR) wprintf(L"Could not open registry key for proc0: %x\n",errcode); + errcode=RegQueryValueEx(reghandle, "ProcessorNameString", NULL,NULL ,pvalstring,&size); + if (errcode != ERROR_SUCCESS) wprintf(L"Error reading cpuname from registry:%x\n",errcode); +//wprintf(stderr,L"%s\n",(PWSTR)valstring); + RegCloseKey(reghandle); + if (strstr(valstring, "Snapdragon(R) X Elite")) return CPU_NEOVERSEN1; + if (strstr(valstring, "Ampere(R) Altra")) return CPU_NEOVERSEN1; + if (strstr(valstring, "Snapdragon (TM) 8cx Gen 3")) return CPU_CORTEXX1; + if (strstr(valstring, "Snapdragon Compute Platform")) return CPU_CORTEXX1; +#endif #endif return CPU_ARMV8; #endif From 100e74d4d627511b2388ce199f3a2a80d5587ad3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jan 2025 18:09:58 +0100 Subject: [PATCH 013/205] restore deleted line break --- .github/workflows/mips64.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml index 56da22c6b..bad7bf85e 100644 --- a/.github/workflows/mips64.yml +++ b/.github/workflows/mips64.yml @@ -59,7 +59,8 @@ jobs: - name: Compilation cache uses: actions/cache@v3 - with: path: ~/.ccache + with: + path: ~/.ccache key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} restore-keys: | ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} From e114880dc4e6d3413303fe045dee19b6c389e979 Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 17 Jan 2025 16:01:50 +0800 Subject: [PATCH 014/205] kernel/generic: Fixed cscal and zscal --- interface/zscal.c | 4 +-- kernel/arm/zscal.c | 89 +++++++++++++++++++++------------------------- 2 files changed, 42 insertions(+), 51 deletions(-) diff --git a/interface/zscal.c b/interface/zscal.c index 498377343..0e52d113b 100644 --- a/interface/zscal.c +++ b/interface/zscal.c @@ -98,7 +98,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ if (nthreads == 1) { #endif - SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 0); + SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 1); #ifdef SMP } else { @@ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ mode = BLAS_SINGLE | BLAS_COMPLEX; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 1, (int (*)(void))SCAL_K, nthreads); } #endif diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c index c4855f73e..b210f9af3 100644 --- a/kernel/arm/zscal.c +++ b/kernel/arm/zscal.c @@ -27,65 +27,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************************** * 2013/09/14 Saar -* BLASTEST float : OK -* BLASTEST double : OK -* CTEST : OK -* TEST : OK +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK * **************************************************************************************/ #include "common.h" +// The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces. 
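+// (For instance, cblas_c/zscal with a zero alpha must still propagate NaN and Inf values
+// present in x, whereas the internal callers expect x to simply be zeroed in that case.)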
+// In certain cases, the expected return values for cblas_s/zscal differ from those of other upper-level interfaces. +// To handle this, we use the dummy2 parameter to differentiate between them. int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i=0; - BLASLONG inc_x2; - BLASLONG ip = 0; - FLOAT temp; + BLASLONG i = 0; + BLASLONG inc_x2; + BLASLONG ip = 0; + FLOAT temp; - if ( (n <= 0) || (inc_x <= 0)) - return(0); + if ((n <= 0) || (inc_x <= 0)) + return(0); + inc_x2 = 2 * inc_x; + if (dummy2 == 0) { + for (i = 0; i < n; i++) + { + if (da_r == 0.0 && da_i == 0.0) + { + x[ip] = 0.0; + x[ip+1] = 0.0; + } + else + { + temp = da_r * x[ip] - da_i * x[ip+1]; + x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; + x[ip] = temp; + } - inc_x2 = 2 * inc_x; - for ( i=0; i Date: Mon, 20 Jan 2025 15:58:15 +0800 Subject: [PATCH 015/205] utest: Add utest for {c/z}scal and {c/z}gemv --- utest/test_gemv.c | 474 +++++++++++++++++++++++++++++++++++++++++++++ utest/test_zscal.c | 54 ++++++ 2 files changed, 528 insertions(+) diff --git a/utest/test_gemv.c b/utest/test_gemv.c index dab6d2f11..66fc30995 100644 --- a/utest/test_gemv.c +++ b/utest/test_gemv.c @@ -128,3 +128,477 @@ CTEST(dgemv, 0_nan_inf_incy_2) } #endif + +#ifdef BUILD_COMPLEX + +CTEST(cgemv, 0_nan_inf) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 1; + float alpha[2] = {0.0, 0.0}; + float beta[2] = {0.0, 0.0}; + char trans = 'N'; + float A[17 * 17 * 4]; + float X[17 * 2]; + float Y[17 * 2]; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + for (i = 0; i < (2 * N - 2); i += 4) + { + Y[i] = NAN; + Y[i + 1] = NAN; + + Y[i + 2] = INFINITY; + Y[i + 3] = INFINITY; + } + Y[2 * N - 1] = NAN; + Y[2 * N - 2] = NAN; + BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 2 * N; i ++) + ASSERT_TRUE(Y[i] == 0.0); +} + +CTEST(cgemv, 0_nan_inf_incy_2) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 2; + float alpha[2] = {0.0, 0.0}; + float beta[2] = {0.0, 0.0}; + char trans = 'N'; + float A[17 * 17 * 4]; + float X[17]; + float Y[17 * 2 * 2]; + float *ay = Y; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + memset(Y, 0, sizeof(Y)); + for (i = 0; i < (2 * N - 2); i += 4) + { + ay[0] = NAN; + ay[1] = NAN; + ay += 4; + ay[0] = INFINITY; + ay[1] = INFINITY; + ay += 4; + } + Y[4 * N - 4] = NAN; + Y[4 * N - 3] = NAN; + BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 4 * N; i ++) + ASSERT_TRUE(Y[i] == 0.0); +} + +CTEST(cgemv, 0_2_nan_1_inf_1) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 1; + float alpha[2] = {0.0, 0.0}; + float beta[2] = {0.0, 2.0}; + char trans = 'N'; + float A[17 * 17 * 4]; + float X[17 * 2]; + float Y[17 * 2]; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + for (i = 0; i < (2 * N - 2); i += 4) + { + Y[i] = NAN; + Y[i + 1] = 1.0; + + Y[i + 2] = INFINITY; + Y[i + 3] = 1.0; + } + Y[2 * N - 2] = NAN; + Y[2 * N - 1] = 1.0; + BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 2 * N; i += 2) { + if ((i >> 1) % 2){ + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isinf(Y[i + 1])); + } + else { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } +} + +CTEST(cgemv, 0_2_nan_1_inf_1_incy_2) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 2; + float alpha[2] = {0.0, 0.0}; + float beta[2] = {0.0, 2.0}; + char trans 
= 'N'; + float A[17 * 17 * 4]; + float X[17]; + float Y[17 * 2 * 2]; + float *ay = Y; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + memset(Y, 0, sizeof(Y)); + for (i = 0; i < (2 * N - 2); i += 4) + { + ay[0] = NAN; + ay[1] = 1.0; + ay += 4; + ay[0] = INFINITY; + ay[1] = 1.0; + ay += 4; + } + Y[4 * N - 4] = NAN; + Y[4 * N - 3] = 1.0; + BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 4 * N; i += 2) { + if ((i >> 1) % 2) { + ASSERT_TRUE(Y[i] == 0.0); + ASSERT_TRUE(Y[i + 1] == 0.0); + } + else { + if ((i >> 2) % 2) { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isinf(Y[i + 1])); + } + else { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } + } +} + +CTEST(cgemv, 2_0_nan_1_inf_1) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 1; + float alpha[2] = {0.0, 0.0}; + float beta[2] = {2.0, 0.0}; + char trans = 'N'; + float A[17 * 17 * 4]; + float X[17 * 2]; + float Y[17 * 2]; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + for (i = 0; i < (2 * N - 2); i += 4) + { + Y[i] = NAN; + Y[i + 1] = 1.0; + + Y[i + 2] = INFINITY; + Y[i + 3] = 1.0; + } + Y[2 * N - 2] = NAN; + Y[2 * N - 1] = 1.0; + BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 2 * N; i += 2) { + if ((i >> 1) % 2){ + ASSERT_TRUE(isinf(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + else { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } +} + +CTEST(cgemv, 2_0_nan_1_inf_1_incy_2) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 2; + float alpha[2] = {0.0, 0.0}; + float beta[2] = {2.0, 0.0}; + char trans = 'N'; + float A[17 * 17 * 4]; + float X[17]; + float Y[17 * 2 * 2]; + float *ay = Y; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + memset(Y, 0, sizeof(Y)); + for (i = 0; i < (2 * N - 2); i += 4) + { + ay[0] = NAN; + ay[1] = 1.0; + ay += 4; + ay[0] = INFINITY; + ay[1] = 1.0; + ay += 4; + } + Y[4 * N - 4] = NAN; + Y[4 * N - 3] = 1.0; + BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 4 * N; i += 2) { + if ((i >> 1) % 2) { + ASSERT_TRUE(Y[i] == 0.0); + ASSERT_TRUE(Y[i + 1] == 0.0); + } + else { + if ((i >> 2) % 2) { + ASSERT_TRUE(isinf(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + else { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } + } +} + +#endif + +#ifdef BUILD_COMPLEX16 + +CTEST(zgemv, 0_nan_inf) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 1; + double alpha[2] = {0.0, 0.0}; + double beta[2] = {0.0, 0.0}; + char trans = 'N'; + double A[17 * 17 * 4]; + double X[17 * 2]; + double Y[17 * 2]; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + for (i = 0; i < (2 * N - 2); i += 4) + { + Y[i] = NAN; + Y[i + 1] = NAN; + + Y[i + 2] = INFINITY; + Y[i + 3] = INFINITY; + } + Y[2 * N - 1] = NAN; + Y[2 * N - 2] = NAN; + BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 2 * N; i ++) + ASSERT_TRUE(Y[i] == 0.0); +} + +CTEST(zgemv, 0_nan_inf_incy_2) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 2; + double alpha[2] = {0.0, 0.0}; + double beta[2] = {0.0, 0.0}; + char trans = 'N'; + double A[17 * 17 * 4]; + double X[17]; + double Y[17 * 2 * 2]; + double *ay = Y; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + memset(Y, 0, sizeof(Y)); + for (i = 0; i < (2 * N - 2); i += 4) + { + ay[0] = NAN; + ay[1] = NAN; + ay += 4; + ay[0] = INFINITY; + ay[1] = INFINITY; + ay += 4; + } + Y[4 * N - 4] = NAN; + Y[4 * N - 3] = NAN; + 
BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 4 * N; i ++) + ASSERT_TRUE(Y[i] == 0.0); +} + +CTEST(zgemv, 0_2_nan_1_inf_1) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 1; + double alpha[2] = {0.0, 0.0}; + double beta[2] = {0.0, 2.0}; + char trans = 'N'; + double A[17 * 17 * 4]; + double X[17 * 2]; + double Y[17 * 2]; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + for (i = 0; i < (2 * N - 2); i += 4) + { + Y[i] = NAN; + Y[i + 1] = 1.0; + + Y[i + 2] = INFINITY; + Y[i + 3] = 1.0; + } + Y[2 * N - 2] = NAN; + Y[2 * N - 1] = 1.0; + BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 2 * N; i += 2) { + if ((i >> 1) % 2){ + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isinf(Y[i + 1])); + } + else { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } +} + +CTEST(zgemv, 0_2_nan_1_inf_1_incy_2) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 2; + double alpha[2] = {0.0, 0.0}; + double beta[2] = {0.0, 2.0}; + char trans = 'N'; + double A[17 * 17 * 4]; + double X[17]; + double Y[17 * 2 * 2]; + double *ay = Y; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + memset(Y, 0, sizeof(Y)); + for (i = 0; i < (2 * N - 2); i += 4) + { + ay[0] = NAN; + ay[1] = 1.0; + ay += 4; + ay[0] = INFINITY; + ay[1] = 1.0; + ay += 4; + } + Y[4 * N - 4] = NAN; + Y[4 * N - 3] = 1.0; + BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 4 * N; i += 2) { + if ((i >> 1) % 2) { + ASSERT_TRUE(Y[i] == 0.0); + ASSERT_TRUE(Y[i + 1] == 0.0); + } + else { + if ((i >> 2) % 2) { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isinf(Y[i + 1])); + } + else { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } + } +} + +CTEST(zgemv, 2_0_nan_1_inf_1) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 1; + double alpha[2] = {0.0, 0.0}; + double beta[2] = {2.0, 0.0}; + char trans = 'N'; + double A[17 * 17 * 4]; + double X[17 * 2]; + double Y[17 * 2]; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + for (i = 0; i < (2 * N - 2); i += 4) + { + Y[i] = NAN; + Y[i + 1] = 1.0; + + Y[i + 2] = INFINITY; + Y[i + 3] = 1.0; + } + Y[2 * N - 2] = NAN; + Y[2 * N - 1] = 1.0; + BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 2 * N; i += 2) { + if ((i >> 1) % 2){ + ASSERT_TRUE(isinf(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + else { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } +} + +CTEST(zgemv, 2_0_nan_1_inf_1_incy_2) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 2; + double alpha[2] = {0.0, 0.0}; + double beta[2] = {2.0, 0.0}; + char trans = 'N'; + double A[17 * 17 * 4]; + double X[17]; + double Y[17 * 2 * 2]; + double *ay = Y; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + memset(Y, 0, sizeof(Y)); + for (i = 0; i < (2 * N - 2); i += 4) + { + ay[0] = NAN; + ay[1] = 1.0; + ay += 4; + ay[0] = INFINITY; + ay[1] = 1.0; + ay += 4; + } + Y[4 * N - 4] = NAN; + Y[4 * N - 3] = 1.0; + BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 4 * N; i += 2) { + if ((i >> 1) % 2) { + ASSERT_TRUE(Y[i] == 0.0); + ASSERT_TRUE(Y[i + 1] == 0.0); + } + else { + if ((i >> 2) % 2) { + ASSERT_TRUE(isinf(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + else { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } + } +} + +#endif diff --git a/utest/test_zscal.c b/utest/test_zscal.c index 09e63752c..57d78b690 100644 
--- a/utest/test_zscal.c +++ b/utest/test_zscal.c @@ -442,6 +442,33 @@ CTEST(cscal, i_0inf_inc_2) ASSERT_TRUE(isnan(inf[17])); } +CTEST(cscal, i00_NAN) +{ + blasint N=9; + blasint incX=1; + float i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 }; + float nan[] = {NAN, 0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + BLASFUNC(cscal)(&N, i, nan, &incX); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + +CTEST(cscal, i00_NAN_incx_2) +{ + blasint N=9; + blasint incX=2; + float i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 }; + float nan[] = {0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, + 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN}; + BLASFUNC(cscal)(&N, i, nan, &incX); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + #endif #ifdef BUILD_COMPLEX16 @@ -588,4 +615,31 @@ CTEST(zscal, i_0inf_inc_2) ASSERT_TRUE(isnan(inf[17])); } +CTEST(zscal, i00_NAN) +{ + blasint N=9; + blasint incX=1; + double i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 }; + double nan[] = {NAN, 0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + BLASFUNC(zscal)(&N, i, nan, &incX); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + +CTEST(zscal, i00_NAN_incx_2) +{ + blasint N=9; + blasint incX=2; + double i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 }; + double nan[] = {0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, + 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN}; + BLASFUNC(zscal)(&N, i, nan, &incX); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + #endif From b2117bb2cadd0c1894104ce6a7c0980cd7c9ffb7 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 16 Jan 2025 19:44:11 +0800 Subject: [PATCH 016/205] LoongArch64: Fixed LSX version of cscal and zscal --- kernel/loongarch64/cscal_lsx.S | 218 +++++++++------------------------ 1 file changed, 58 insertions(+), 160 deletions(-) diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S index 241d3d16e..c235a206a 100644 --- a/kernel/loongarch64/cscal_lsx.S +++ b/kernel/loongarch64/cscal_lsx.S @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHAI $f1 #define X $r7 #define INCX $r8 +#define DUMMY2 $r9 #define I $r12 #define TEMP $r13 @@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, N, .L999 bge $r0, INCX, .L999 + ld.d DUMMY2, $sp, 0 li.d TEMP, 1 movgr2fr.d a1, $r0 FFINT a1, a1 @@ -84,24 +86,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. srai.d I, N, 2 bne INCX, TEMP, .L22 +/////// INCX == 1 //////// .L11: - bge $r0, I, .L997 CMPEQ $fcc0, ALPHAR, a1 CMPEQ $fcc1, ALPHAI, a1 - bceqz $fcc0, .L13 - b .L14 - .align 3 + bge $r0, I, .L19 -.L13: - bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 - b .L113 //alpha_r != 0.0 && alpha_i == 0.0 +/////// INCX == 1 && N >= 4 //////// + bnez DUMMY2, .L17 // if DUMMPY2 == 1, called from c/zscal. 
-.L14: - bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 - b .L111 //alpha_r == 0.0 && alpha_i == 0.0 - .align 3 + bceqz $fcc0, .L17 -.L111: //alpha_r == 0.0 && alpha_i == 0.0 + bceqz $fcc1, .L17 + +.L15: //alpha_r == 0.0 && alpha_i == 0.0 vst VXZ, X, 0 * SIZE #ifdef DOUBLE vst VXZ, X, 2 * SIZE @@ -112,50 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif addi.d X, X, 8 * SIZE addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L113: //alpha_r != 0.0 && alpha_i == 0.0 - vld VX0, X, 0 * SIZE -#ifdef DOUBLE - vld VX1, X, 2 * SIZE - vpickev.d x1, VX1, VX0 - vpickod.d x2, VX1, VX0 - vfmul.d x3, VXAR, x1 - vfmul.d x4, VXAR, x2 - vilvl.d VX2, x4 ,x3 - vilvh.d VX3, x4, x3 - vst VX2, X, 0 * SIZE - vst VX3, X, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vpickev.d x1, VX1, VX0 - vpickod.d x2, VX1, VX0 - vfmul.d x3, VXAR, x1 - vfmul.d x4, VXAR, x2 - vilvl.d VX2, x4 ,x3 - vilvh.d VX3, x4, x3 - vst VX2, X, 4 * SIZE - vst VX3, X, 6 * SIZE -#else - vld VX1, X, 4 * SIZE - vpickev.w x1, VX1, VX0 - vpickod.w x2, VX1, VX0 - vfmul.s x3, VXAR, x1 - vfmul.s x4, VXAR, x2 - vilvl.w VX2, x4 ,x3 - vilvh.w VX3, x4, x3 - vst VX2, X, 0 * SIZE - vst VX3, X, 4 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 + blt $r0, I, .L15 + b .L19 .align 3 -.L114: //alpha_r != 0.0 && alpha_i != 0.0 +.L17: vld VX0, X, 0 * SIZE #ifdef DOUBLE vld VX1, X, 2 * SIZE @@ -196,29 +155,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif addi.d X, X, 8 * SIZE addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 + blt $r0, I, .L17 + b .L19 .align 3 +/////// INCX == 1 && N < 8 /////// +.L19: + andi I, N, 3 + beqz I, .L999 + bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. + + bceqz $fcc0, .L998 + + bceqz $fcc1, .L998 + + b .L995 // alpha_r == 0.0 && alpha_i == 0.0 + +/////// INCX != 1 //////// .L22: - bge $r0, I, .L997 - move XX, X CMPEQ $fcc0, ALPHAR, a1 CMPEQ $fcc1, ALPHAI, a1 - bceqz $fcc0, .L23 - b .L24 - .align 3 + move XX, X + bge $r0, I, .L29 + bnez DUMMY2, .L25 // if DUMMPY2 == 1, called from c/zscal. -.L23: - bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 - b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + bceqz $fcc0, .L25 -.L24: - bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 - b .L221 //alpha_r == 0.0 && alpha_i == 0.0 - .align 3 + bceqz $fcc1, .L25 -.L221: //alpha_r == 0.0 && alpha_i == 0.0 +.L27: //alpha_r == 0.0 && alpha_i == 0.0 #ifdef DOUBLE vstelm.d VXZ, X, 0, 0 vstelm.d VXZ, X, 1 * SIZE, 0 @@ -246,92 +211,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif add.d X, X, INCX addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 + blt $r0, I, .L27 + b .L29 .align 3 -.L223: //alpha_r != 0.0 && alpha_i == 0.0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - vfmul.d x3, VXAR, x1 - vfmul.d x4, VXAR, x2 - vstelm.d x3, XX, 0 * SIZE, 0 - vstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d x3, XX, 0 * SIZE, 1 - vstelm.d x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - add.d X, X, INCX - vfmul.d x3, VXAR, x1 - vfmul.d x4, VXAR, x2 - addi.d I, I, -1 - vstelm.d x3, XX, 0 * SIZE, 0 - vstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d x3, XX, 0 * SIZE, 1 - vstelm.d x4, XX, 1 * SIZE, 1 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.w x1, t1, 0 - vinsgr2vr.w x2, t2, 0 - vinsgr2vr.w x1, t3, 1 - vinsgr2vr.w x2, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - vinsgr2vr.w x1, t1, 2 - vinsgr2vr.w x2, t2, 2 - vinsgr2vr.w x1, t3, 3 - vinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - - vfmul.s x3, VXAR, x1 - vfmul.s x4, VXAR, x2 - addi.d I, I, -1 - vstelm.w x3, XX, 0 * SIZE, 0 - vstelm.w x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 1 - vstelm.w x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 2 - vstelm.w x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 3 - vstelm.w x4, XX, 1 * SIZE, 3 -#endif - add.d XX, XX, INCX - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: //alpha_r != 0.0 && alpha_i != 0.0 +.L25: #ifdef DOUBLE ld.d t1, X, 0 * SIZE ld.d t2, X, 1 * SIZE @@ -414,15 +298,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vstelm.w x4, XX, 1 * SIZE, 3 #endif add.d XX, XX, INCX - blt $r0, I, .L224 - b .L997 + blt $r0, I, .L25 + b .L29 .align 3 -.L997: - andi I, N, 3 - bge $r0, I, .L999 - .align 3 +/////// INCX != 1 && N < 8 /////// +.L29: + andi I, N, 3 + beqz I, .L999 + bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. + + bceqz $fcc0, .L998 + + bceqz $fcc1, .L998 + b .L995 // alpha_r == 0.0 && alpha_i == 0.0 + +.L995: // alpha_r == 0.0 && alpha_i == 0.0 + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L995 + b .L999 .L998: LD a1, X, 0 * SIZE LD a2, X, 1 * SIZE @@ -435,7 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ST s2, X, 1 * SIZE add.d X, X, INCX blt $r0, I, .L998 - .align 3 + b .L999 .L999: move $r4, $r12 From c0318cea6e76d2aafd52875d502780329960728e Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Mon, 20 Jan 2025 14:37:16 +0000 Subject: [PATCH 017/205] Simplify gemv_t_sve_v1x3 kernel --- CONTRIBUTORS.md | 3 ++ kernel/arm64/gemv_t_sve_v1x3.c | 94 ++++++++++++++++++++-------------- 2 files changed, 58 insertions(+), 39 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 508dbcd0e..4df690228 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -232,3 +232,6 @@ In chronological order: * Aniket P. 
Garade Sushil Pratap Singh Juliya James * [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE + +* Annop Wongwathanarat + * [2025-01-21] Optimize gemv_t_sve_v1x3 kernel \ No newline at end of file diff --git a/kernel/arm64/gemv_t_sve_v1x3.c b/kernel/arm64/gemv_t_sve_v1x3.c index e481abec7..bcd0de0bf 100644 --- a/kernel/arm64/gemv_t_sve_v1x3.c +++ b/kernel/arm64/gemv_t_sve_v1x3.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2024, The OpenBLAS Project +Copyright (c) 2024, 2025 The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -56,12 +56,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG ix,iy; BLASLONG j; FLOAT *a_ptr; + FLOAT *y_ptr; FLOAT temp; iy = 0; if (inc_x == 1) { - BLASLONG width = (n + 3 - 1) / 3; + BLASLONG width = n / 3; + BLASLONG sve_size = SV_COUNT(); + svbool_t pg_true = SV_TRUE(); + svbool_t pg = SV_WHILE(0, m % sve_size); FLOAT *a0_ptr = a + lda * width * 0; FLOAT *a1_ptr = a + lda * width * 1; @@ -72,60 +76,41 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, FLOAT *y2_ptr = y + inc_y * width * 2; for (j = 0; j < width; j++) { - svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); - svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); - svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); - SV_TYPE temp00_vec = SV_DUP(0.0); SV_TYPE temp01_vec = SV_DUP(0.0); SV_TYPE temp02_vec = SV_DUP(0.0); i = 0; - BLASLONG sve_size = SV_COUNT(); while ((i + sve_size * 1 - 1) < m) { - SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0); + SV_TYPE x0_vec = svld1(pg_true, x + i); - SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); - SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); - SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i); + SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i); + SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i); - temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); - temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); - temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); + temp00_vec = svmla_x(pg_true, temp00_vec, a00_vec, x0_vec); + temp01_vec = svmla_x(pg_true, temp01_vec, a01_vec, x0_vec); + temp02_vec = svmla_x(pg_true, temp02_vec, a02_vec, x0_vec); i += sve_size * 1; } if (i < m) { - svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); - - pg00 = svand_z(SV_TRUE(), pg0, pg00); - pg01 = svand_z(SV_TRUE(), pg0, pg01); - pg02 = svand_z(SV_TRUE(), pg0, pg02); + SV_TYPE x0_vec = svld1(pg, x + i); - SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0); + SV_TYPE a00_vec = svld1(pg, a0_ptr + i); + SV_TYPE a01_vec = svld1(pg, a1_ptr + i); + SV_TYPE a02_vec = svld1(pg, a2_ptr + i); - SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); - SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); - SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); - - temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); - temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); - temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); + temp00_vec = svmla_m(pg, temp00_vec, a00_vec, x0_vec); + temp01_vec = svmla_m(pg, temp01_vec, a01_vec, x0_vec); + temp02_vec = svmla_m(pg, temp02_vec, a02_vec, x0_vec); } - if ((j + width * 0) < n) { - temp = svaddv(SV_TRUE(), temp00_vec); - y0_ptr[iy] += alpha * temp; - } - if ((j + width * 1) < n) { - temp = svaddv(SV_TRUE(), temp01_vec); - y1_ptr[iy] += alpha * temp; - } - if ((j + 
width * 2) < n) { - temp = svaddv(SV_TRUE(), temp02_vec); - y2_ptr[iy] += alpha * temp; - } + y0_ptr[iy] += alpha * svaddv(pg_true, temp00_vec); + y1_ptr[iy] += alpha * svaddv(pg_true, temp01_vec); + y2_ptr[iy] += alpha * svaddv(pg_true, temp02_vec); + iy += inc_y; a0_ptr += lda; @@ -133,6 +118,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, a2_ptr += lda; } + a_ptr = a2_ptr; + y_ptr = y2_ptr; + for (j = width * 3; j < n; j++) { + SV_TYPE temp_vec = SV_DUP(0.0); + + i = 0; + while ((i + sve_size * 1 - 1) < m) { + SV_TYPE x_vec = svld1(pg_true, x + i); + + SV_TYPE a_vec = svld1(pg_true, a_ptr + i); + + temp_vec = svmla_x(pg_true, temp_vec, a_vec, x_vec); + + i += sve_size * 1; + } + + if (i < m) { + SV_TYPE x_vec = svld1(pg, x + i); + + SV_TYPE a_vec = svld1(pg, a_ptr + i); + + temp_vec = svmla_m(pg, temp_vec, a_vec, x_vec); + } + + y_ptr[iy] += alpha * svaddv(pg_true, temp_vec); + + iy += inc_y; + + a_ptr += lda; + } + return(0); } From 3c8df6358f1b2537b983af4b2f93df87cf91e2c9 Mon Sep 17 00:00:00 2001 From: "tingbo.liao" Date: Wed, 22 Jan 2025 11:41:12 +0800 Subject: [PATCH 018/205] Further rearranged the rotm kernel for the different architectures. Signed-off-by: tingbo.liao --- cmake/kernel.cmake | 3 + common_d.h | 2 + common_level1.h | 6 +- common_macro.h | 3 + common_param.h | 3 + common_q.h | 2 + common_s.h | 2 + interface/rotm.c | 140 +------------- kernel/CMakeLists.txt | 3 + kernel/Makefile.L1 | 22 ++- kernel/alpha/KERNEL | 12 ++ kernel/arm/KERNEL | 10 + kernel/arm64/KERNEL | 10 + kernel/arm64/KERNEL.generic | 12 ++ kernel/csky/KERNEL | 10 + kernel/e2k/KERNEL | 10 + kernel/generic/rotm.c | 159 ++++++++++++++++ kernel/ia64/KERNEL | 12 ++ kernel/loongarch64/KERNEL | 12 ++ kernel/loongarch64/KERNEL.generic | 12 ++ kernel/mips/KERNEL | 10 + kernel/mips/KERNEL.generic | 12 ++ kernel/mips64/KERNEL | 12 ++ kernel/mips64/KERNEL.generic | 12 ++ kernel/power/KERNEL | 12 ++ kernel/riscv64/KERNEL | 10 + kernel/riscv64/KERNEL.C910V | 4 + kernel/riscv64/KERNEL.RISCV64_GENERIC | 4 + kernel/riscv64/KERNEL.RISCV64_ZVL128B | 4 + kernel/riscv64/KERNEL.RISCV64_ZVL256B | 4 + kernel/riscv64/KERNEL.x280 | 4 + kernel/riscv64/rotm_rvv.c | 260 ++++++++++++++++++++++++++ kernel/setparam-ref.c | 5 +- kernel/sparc/KERNEL | 11 ++ kernel/x86/KERNEL | 11 ++ kernel/x86/KERNEL.generic | 12 ++ kernel/x86_64/KERNEL | 12 ++ kernel/x86_64/KERNEL.generic | 12 ++ kernel/zarch/KERNEL | 10 + kernel/zarch/KERNEL.ZARCH_GENERIC | 9 + utest/test_rot.c | 36 ++++ 41 files changed, 770 insertions(+), 141 deletions(-) create mode 100644 kernel/generic/rotm.c create mode 100644 kernel/riscv64/rotm_rvv.c diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index efededcf3..2cea6d9e6 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -79,6 +79,9 @@ macro(SetDefaultL1) SetFallback(CROTKERNEL zrot.S) SetFallback(ZROTKERNEL zrot.S) SetFallback(XROTKERNEL zrot.S) + SetFallback(SROTMKERNEL rotm.S) + SetFallback(DROTMKERNEL rotm.S) + SetFallback(QROTMKERNEL rotm.S) SetFallback(SSCALKERNEL scal.S) SetFallback(DSCALKERNEL scal.S) SetFallback(CSCALKERNEL zscal.S) diff --git a/common_d.h b/common_d.h index 6f4bb2ded..1e8c33d7a 100644 --- a/common_d.h +++ b/common_d.h @@ -22,6 +22,7 @@ #define DSUM_K dsum_k #define DSWAP_K dswap_k #define DROT_K drot_k +#define DROTM_K drotm_k #define DGEMV_N dgemv_n #define DGEMV_T dgemv_t @@ -180,6 +181,7 @@ #define DSUM_K gotoblas -> dsum_k #define DSWAP_K gotoblas -> dswap_k #define DROT_K gotoblas -> drot_k +#define DROTM_K gotoblas -> drotm_k #define DGEMV_N 
gotoblas -> dgemv_n #define DGEMV_T gotoblas -> dgemv_t diff --git a/common_level1.h b/common_level1.h index d2ed47e56..85b39f7a7 100644 --- a/common_level1.h +++ b/common_level1.h @@ -213,9 +213,9 @@ int srotmg_k(float *, float *, float *, float *, float *); int drotmg_k(double *, double *, double *, double *, double *); int qrotmg_k(xdouble *, xdouble *, xdouble *, xdouble *, xdouble *); -int srotm_k (BLASLONG, float, BLASLONG, float, BLASLONG, float); -int drotm_k (BLASLONG, double, BLASLONG, double, BLASLONG, double); -int qrotm_k (BLASLONG, xdouble, BLASLONG, xdouble, BLASLONG, xdouble); +int srotm_k (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int drotm_k (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int qrotm_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int saxpby_k (BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index a924651de..820cb472a 100644 --- a/common_macro.h +++ b/common_macro.h @@ -70,6 +70,7 @@ #define SUM_K QSUM_K #define SWAP_K QSWAP_K #define ROT_K QROT_K +#define ROTM_K QROTM_K #define GEMV_N QGEMV_N #define GEMV_T QGEMV_T @@ -361,6 +362,7 @@ #define SUM_K DSUM_K #define SWAP_K DSWAP_K #define ROT_K DROT_K +#define ROTM_K DROTM_K #define GEMV_N DGEMV_N #define GEMV_T DGEMV_T @@ -977,6 +979,7 @@ #define SUM_K SSUM_K #define SWAP_K SSWAP_K #define ROT_K SROT_K +#define ROTM_K SROTM_K #define GEMV_N SGEMV_N #define GEMV_T SGEMV_T diff --git a/common_param.h b/common_param.h index c082d248e..a3e4cea6b 100644 --- a/common_param.h +++ b/common_param.h @@ -197,6 +197,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); + int (*srotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); #endif #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -330,6 +331,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); #endif #if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); + int (*drotm_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -439,6 +441,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); + int (*qrotm_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); diff --git a/common_q.h b/common_q.h index b4ace3a62..1d976f1e8 100644 --- a/common_q.h +++ b/common_q.h @@ -22,6 +22,7 @@ #define QSUM_K qsum_k #define QSWAP_K qswap_k #define QROT_K qrot_k +#define QROTM_K qrotm_k 
#define QGEMV_N qgemv_n #define QGEMV_T qgemv_t @@ -165,6 +166,7 @@ #define QSUM_K gotoblas -> qsum_k #define QSWAP_K gotoblas -> qswap_k #define QROT_K gotoblas -> qrot_k +#define QROTM_K gotoblas -> qrotm_k #define QGEMV_N gotoblas -> qgemv_n #define QGEMV_T gotoblas -> qgemv_t diff --git a/common_s.h b/common_s.h index fdd80b62f..7c7390259 100644 --- a/common_s.h +++ b/common_s.h @@ -24,6 +24,7 @@ #define SSCAL_K sscal_k #define SSWAP_K sswap_k #define SROT_K srot_k +#define SROTM_K srotm_k #define SGEMV_N sgemv_n #define SGEMV_T sgemv_t @@ -189,6 +190,7 @@ #define SSCAL_K gotoblas -> sscal_k #define SSWAP_K gotoblas -> sswap_k #define SROT_K gotoblas -> srot_k +#define SROTM_K gotoblas -> srotm_k #define SGEMV_N gotoblas -> sgemv_n #define SGEMV_T gotoblas -> sgemv_t diff --git a/interface/rotm.c b/interface/rotm.c index 9dc08354a..9ef87da32 100644 --- a/interface/rotm.c +++ b/interface/rotm.c @@ -7,149 +7,21 @@ void NAME(blasint *N, FLOAT *dx, blasint *INCX, FLOAT *dy, blasint *INCY, FLOAT *dparam){ - blasint n = *N; - blasint incx = *INCX; - blasint incy = *INCY; + blasint n = *N; + blasint incx = *INCX; + blasint incy = *INCY; + PRINT_DEBUG_NAME #else void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *dparam){ -#endif - - blasint i__1, i__2; + PRINT_DEBUG_CNAME; - blasint i__; - FLOAT w, z__; - blasint kx, ky; - FLOAT dh11, dh12, dh22, dh21, dflag; - blasint nsteps; - -#ifndef CBLAS - PRINT_DEBUG_CNAME; -#else - PRINT_DEBUG_CNAME; #endif - --dparam; - --dy; - --dx; - - dflag = dparam[1]; - if (n <= 0 || dflag == - 2.0) goto L140; - - if (! (incx == incy && incx > 0)) goto L70; - - nsteps = n * incx; - if (dflag < 0.) { - goto L50; - } else if (dflag == 0) { - goto L10; - } else { - goto L30; - } -L10: - dh12 = dparam[4]; - dh21 = dparam[3]; - i__1 = nsteps; - i__2 = incx; - for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { - w = dx[i__]; - z__ = dy[i__]; - dx[i__] = w + z__ * dh12; - dy[i__] = w * dh21 + z__; -/* L20: */ - } - goto L140; -L30: - dh11 = dparam[2]; - dh22 = dparam[5]; - i__2 = nsteps; - i__1 = incx; - for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { - w = dx[i__]; - z__ = dy[i__]; - dx[i__] = w * dh11 + z__; - dy[i__] = -w + dh22 * z__; -/* L40: */ - } - goto L140; -L50: - dh11 = dparam[2]; - dh12 = dparam[4]; - dh21 = dparam[3]; - dh22 = dparam[5]; - i__1 = nsteps; - i__2 = incx; - for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { - w = dx[i__]; - z__ = dy[i__]; - dx[i__] = w * dh11 + z__ * dh12; - dy[i__] = w * dh21 + z__ * dh22; -/* L60: */ - } - goto L140; -L70: - kx = 1; - ky = 1; - if (incx < 0) { - kx = (1 - n) * incx + 1; - } - if (incy < 0) { - ky = (1 - n) * incy + 1; - } + ROTM_K(n, dx, incx, dy, incy, dparam); - if (dflag < 0.) 
{ - goto L120; - } else if (dflag == 0) { - goto L80; - } else { - goto L100; - } -L80: - dh12 = dparam[4]; - dh21 = dparam[3]; - i__2 = n; - for (i__ = 1; i__ <= i__2; ++i__) { - w = dx[kx]; - z__ = dy[ky]; - dx[kx] = w + z__ * dh12; - dy[ky] = w * dh21 + z__; - kx += incx; - ky += incy; -/* L90: */ - } - goto L140; -L100: - dh11 = dparam[2]; - dh22 = dparam[5]; - i__2 = n; - for (i__ = 1; i__ <= i__2; ++i__) { - w = dx[kx]; - z__ = dy[ky]; - dx[kx] = w * dh11 + z__; - dy[ky] = -w + dh22 * z__; - kx += incx; - ky += incy; -/* L110: */ - } - goto L140; -L120: - dh11 = dparam[2]; - dh12 = dparam[4]; - dh21 = dparam[3]; - dh22 = dparam[5]; - i__2 = n; - for (i__ = 1; i__ <= i__2; ++i__) { - w = dx[kx]; - z__ = dy[ky]; - dx[kx] = w * dh11 + z__ * dh12; - dy[ky] = w * dh21 + z__ * dh22; - kx += incx; - ky += incy; -/* L130: */ - } -L140: return; } diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 74e6760c2..bc713e603 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -125,6 +125,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SROTMKERNEL}" "" "rotm_k" false "" "" false "SINGLE") endif () if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") @@ -148,6 +149,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") @@ -1105,6 +1107,7 @@ endif () GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 09337363d..6e864e3d8 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -336,6 +336,18 @@ ifndef XROTKERNEL XROTKERNEL = zrot.S endif +ifndef SROTMKERNEL +SROTMKERNEL = rotm.S +endif + +ifndef DROTMKERNEL +DROTMKERNEL = rotm.S +endif + +ifndef QROTMKERNEL +QROTMKERNEL = rotm.S +endif + ### SCAL ### ifndef SSCALKERNEL @@ -504,14 +516,14 @@ SBLASOBJS += \ sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ 
sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ - saxpby_k$(TSUFFIX).$(SUFFIX) + saxpby_k$(TSUFFIX).$(SUFFIX) srotm_k$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ damax_k$(TSUFFIX).$(SUFFIX) damin_k$(TSUFFIX).$(SUFFIX) dmax_k$(TSUFFIX).$(SUFFIX) dmin_k$(TSUFFIX).$(SUFFIX) \ idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ - daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) + daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) drotm_k$(TSUFFIX).$(SUFFIX) QBLASOBJS += \ qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ @@ -841,6 +853,12 @@ $(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERN $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ +$(KDIR)srotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)srotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTMKERNEL) + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)drotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)drotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTMKERNEL) + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ + $(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ diff --git a/kernel/alpha/KERNEL b/kernel/alpha/KERNEL index 01734bf9c..42ae595aa 100644 --- a/kernel/alpha/KERNEL +++ b/kernel/alpha/KERNEL @@ -122,3 +122,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/arm/KERNEL b/kernel/arm/KERNEL index aeccfbf4c..a6ad0bf02 100644 --- a/kernel/arm/KERNEL +++ b/kernel/arm/KERNEL @@ -43,4 +43,14 @@ ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL index 7d7e648c4..05d95683d 100644 --- a/kernel/arm64/KERNEL +++ b/kernel/arm64/KERNEL @@ -45,4 +45,14 @@ ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/arm64/KERNEL.generic b/kernel/arm64/KERNEL.generic index 838adb05a..65c301e68 100644 --- a/kernel/arm64/KERNEL.generic +++ b/kernel/arm64/KERNEL.generic @@ -171,3 +171,15 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = 
../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/csky/KERNEL b/kernel/csky/KERNEL index afa8a0881..0302057a2 100644 --- a/kernel/csky/KERNEL +++ b/kernel/csky/KERNEL @@ -146,4 +146,14 @@ DGEMM_BETA = ../generic/gemm_beta.c CGEMM_BETA = ../generic/zgemm_beta.c ZGEMM_BETA = ../generic/zgemm_beta.c +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/e2k/KERNEL b/kernel/e2k/KERNEL index afa8a0881..0302057a2 100644 --- a/kernel/e2k/KERNEL +++ b/kernel/e2k/KERNEL @@ -146,4 +146,14 @@ DGEMM_BETA = ../generic/gemm_beta.c CGEMM_BETA = ../generic/zgemm_beta.c ZGEMM_BETA = ../generic/zgemm_beta.c +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/generic/rotm.c b/kernel/generic/rotm.c new file mode 100644 index 000000000..e151aa5f8 --- /dev/null +++ b/kernel/generic/rotm.c @@ -0,0 +1,159 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *dx, BLASLONG incx, FLOAT *dy, BLASLONG incy, FLOAT *dparam) +{ + BLASLONG i__1, i__2; + BLASLONG i__; + FLOAT w, z__; + BLASLONG kx, ky; + FLOAT dh11, dh12, dh22, dh21, dflag; + BLASLONG nsteps; + + --dparam; + --dy; + --dx; + + dflag = dparam[1]; + if (n <= 0 || dflag == - 2.0) goto L140; + + if (! (incx == incy && incx > 0)) goto L70; + + nsteps = n * incx; + if (dflag < 0.) { + goto L50; + } else if (dflag == 0) { + goto L10; + } else { + goto L30; + } +L10: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__1 = nsteps; + i__2 = incx; + for (i__ = 1; i__2 < 0 ? 
i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w + z__ * dh12; + dy[i__] = w * dh21 + z__; +/* L20: */ + } + goto L140; +L30: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = nsteps; + i__1 = incx; + for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w * dh11 + z__; + dy[i__] = -w + dh22 * z__; +/* L40: */ + } + goto L140; +L50: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__1 = nsteps; + i__2 = incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w * dh11 + z__ * dh12; + dy[i__] = w * dh21 + z__ * dh22; +/* L60: */ + } + goto L140; +L70: + kx = 1; + ky = 1; + if (incx < 0) { + kx = (1 - n) * incx + 1; + } + if (incy < 0) { + ky = (1 - n) * incy + 1; + } + + if (dflag < 0.) { + goto L120; + } else if (dflag == 0) { + goto L80; + } else { + goto L100; + } +L80: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w + z__ * dh12; + dy[ky] = w * dh21 + z__; + kx += incx; + ky += incy; +/* L90: */ + } + goto L140; +L100: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w * dh11 + z__; + dy[ky] = -w + dh22 * z__; + kx += incx; + ky += incy; +/* L110: */ + } + goto L140; +L120: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w * dh11 + z__ * dh12; + dy[ky] = w * dh21 + z__ * dh22; + kx += incx; + ky += incy; +/* L130: */ + } +L140: + return(0); +} diff --git a/kernel/ia64/KERNEL b/kernel/ia64/KERNEL index 870aac473..bbfec7d55 100644 --- a/kernel/ia64/KERNEL +++ b/kernel/ia64/KERNEL @@ -142,3 +142,15 @@ ZTRSMKERNEL_RT = ztrsm_kernel_RT.S CGEMM3MKERNEL = zgemm3m_kernel.S ZGEMM3MKERNEL = zgemm3m_kernel.S + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/loongarch64/KERNEL b/kernel/loongarch64/KERNEL index e5d145a71..46d8daaa9 100644 --- a/kernel/loongarch64/KERNEL +++ b/kernel/loongarch64/KERNEL @@ -236,3 +236,15 @@ ZGEMM3MKERNEL = zgemm3m_kernel.S endif DSDOTKERNEL = dot.S + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic index 213add9ee..b2e4cb44a 100644 --- a/kernel/loongarch64/KERNEL.generic +++ b/kernel/loongarch64/KERNEL.generic @@ -169,3 +169,15 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/mips/KERNEL b/kernel/mips/KERNEL index aeccfbf4c..a6ad0bf02 100644 --- a/kernel/mips/KERNEL +++ b/kernel/mips/KERNEL @@ -43,4 +43,14 @@ ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif +ifndef QROTMKERNEL +QROTMKERNEL = 
../generic/rotm.c +endif diff --git a/kernel/mips/KERNEL.generic b/kernel/mips/KERNEL.generic index 17f2ef976..1f03c6594 100644 --- a/kernel/mips/KERNEL.generic +++ b/kernel/mips/KERNEL.generic @@ -158,3 +158,15 @@ ZHEMV_L_KERNEL = ../generic/zhemv_k.c CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 54939a9ef..2ebd8a5bd 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -199,3 +199,15 @@ endif ifndef IQMAXKERNEL IQMAXKERNEL = imax.S endif + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/mips64/KERNEL.generic b/kernel/mips64/KERNEL.generic index 17f2ef976..1f03c6594 100644 --- a/kernel/mips64/KERNEL.generic +++ b/kernel/mips64/KERNEL.generic @@ -158,3 +158,15 @@ ZHEMV_L_KERNEL = ../generic/zhemv_k.c CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/power/KERNEL b/kernel/power/KERNEL index 9070450f4..45fe0dd29 100644 --- a/kernel/power/KERNEL +++ b/kernel/power/KERNEL @@ -73,3 +73,15 @@ endif ifndef IQMAXKERNEL IQMAXKERNEL = imax.S endif + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/riscv64/KERNEL b/kernel/riscv64/KERNEL index 68d68b5f8..cd9405203 100644 --- a/kernel/riscv64/KERNEL +++ b/kernel/riscv64/KERNEL @@ -27,4 +27,14 @@ ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 2798a870e..666b3cc5e 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -71,6 +71,10 @@ DROTKERNEL = rot_vector.c CROTKERNEL = zrot_vector.c ZROTKERNEL = zrot_vector.c +SROTMKERNEL = ../generic/rotm.c +DROTMKERNEL = ../generic/rotm.c +QROTMKERNEL = ../generic/rotm.c + SSCALKERNEL = scal_vector.c DSCALKERNEL = scal_vector.c CSCALKERNEL = zscal_vector.c diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC index 67f81cacd..cf7d15d36 100644 --- a/kernel/riscv64/KERNEL.RISCV64_GENERIC +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -71,6 +71,10 @@ DROTKERNEL = ../riscv64/rot.c CROTKERNEL = ../riscv64/zrot.c ZROTKERNEL = ../riscv64/zrot.c +SROTMKERNEL = ../generic/rotm.c +DROTMKERNEL = ../generic/rotm.c +QROTMKERNEL = ../generic/rotm.c + SSCALKERNEL = ../riscv64/scal.c DSCALKERNEL = ../riscv64/scal.c CSCALKERNEL = ../riscv64/zscal.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B index fec69ee09..7fbc26d21 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL128B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B @@ -71,6 +71,10 @@ DROTKERNEL = rot_rvv.c CROTKERNEL = zrot_rvv.c ZROTKERNEL = zrot_rvv.c 
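+# rotm: use the portable C implementation (the rotm_rvv.c vector kernel is only enabled for the x280 target)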
+SROTMKERNEL = ../generic/rotm.c +DROTMKERNEL = ../generic/rotm.c +QROTMKERNEL = ../generic/rotm.c + SSCALKERNEL = scal_rvv.c DSCALKERNEL = scal_rvv.c CSCALKERNEL = zscal_rvv.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index d8690682f..9915fd949 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -66,6 +66,10 @@ DROTKERNEL = rot_vector.c CROTKERNEL = zrot_vector.c ZROTKERNEL = zrot_vector.c +SROTMKERNEL = ../generic/rotm.c +DROTMKERNEL = ../generic/rotm.c +QROTMKERNEL = ../generic/rotm.c + SSCALKERNEL = scal_vector.c DSCALKERNEL = scal_vector.c CSCALKERNEL = zscal_vector.c diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 index e909ca959..18515e812 100644 --- a/kernel/riscv64/KERNEL.x280 +++ b/kernel/riscv64/KERNEL.x280 @@ -98,6 +98,10 @@ DROTKERNEL = rot_rvv.c CROTKERNEL = zrot_rvv.c ZROTKERNEL = zrot_rvv.c +SROTMKERNEL = rotm_rvv.c +DROTMKERNEL = rotm_rvv.c +QROTMKERNEL = ../generic/rotm.c + SSCALKERNEL = scal_rvv.c DSCALKERNEL = scal_rvv.c CSCALKERNEL = zscal_rvv.c diff --git a/kernel/riscv64/rotm_rvv.c b/kernel/riscv64/rotm_rvv.c new file mode 100644 index 000000000..49605666f --- /dev/null +++ b/kernel/riscv64/rotm_rvv.c @@ -0,0 +1,260 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 +#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 +#else +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 +#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 +#endif + +int CNAME(BLASLONG n, FLOAT *dx, BLASLONG incx, FLOAT *dy, BLASLONG incy, FLOAT *dparam) +{ + BLASLONG i__1, i__2; + BLASLONG kx, ky; + FLOAT dh11, dh12, dh22, dh21, dflag; + BLASLONG nsteps; + + --dparam; + --dy; + --dx; + + FLOAT_V_T v_w, v_z__, v_dx, v_dy; + BLASLONG stride, stride_x, stride_y, offset; + + dflag = dparam[1]; + if (n <= 0 || dflag == - 2.0) goto L140; + + if (!(incx == incy && incx > 0)) goto L70; + + nsteps = n * incx; + if (dflag < 0.) { + goto L50; + } else if (dflag == 0) { + goto L10; + } else { + goto L30; + } +L10: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__1 = nsteps; + i__2 = incx; + if(i__2 < 0){ + offset = i__1 - 2; + dx += offset; + dy += offset; + i__1 = -i__1; + i__2 = -i__2; + } + stride = i__2 * sizeof(FLOAT); + n = i__1 / i__2; + for (size_t vl; n > 0; n -= vl, dx += vl*i__2, dy += vl*i__2) { + vl = VSETVL(n); + + v_w = VLSEV_FLOAT(&dx[1], stride, vl); + v_z__ = VLSEV_FLOAT(&dy[1], stride, vl); + + v_dx = VFMACCVF_FLOAT(v_w, dh12, v_z__, vl); + v_dy = VFMACCVF_FLOAT(v_z__, dh21, v_w, vl); + + VSSEV_FLOAT(&dx[1], stride, v_dx, vl); + VSSEV_FLOAT(&dy[1], stride, v_dy, vl); + } + goto L140; +L30: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = nsteps; + i__1 = incx; + if(i__1 < 0){ + offset = i__2 - 2; + dx += offset; + dy += offset; + i__1 = -i__1; + i__2 = -i__2; + } + stride = i__1 * sizeof(FLOAT); + n = i__2 / i__1; + for (size_t vl; n > 0; n -= vl, dx += vl*i__1, dy += vl*i__1) { + vl = VSETVL(n); + + v_w = VLSEV_FLOAT(&dx[1], stride, vl); + v_z__ = VLSEV_FLOAT(&dy[1], stride, vl); + + v_dx = VFMACCVF_FLOAT(v_z__, dh11, v_w, vl); + v_dy = VFMSACVF_FLOAT(v_w, dh22, v_z__, vl); + + VSSEV_FLOAT(&dx[1], stride, v_dx, vl); + VSSEV_FLOAT(&dy[1], stride, v_dy, vl); + } + goto L140; +L50: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__1 = nsteps; + i__2 = incx; + if(i__2 < 0){ + offset = i__1 - 2; + dx += offset; + dy += offset; + i__1 = -i__1; + i__2 = -i__2; + } + stride = i__2 * sizeof(FLOAT); + n = i__1 / i__2; + for (size_t vl; n > 0; n -= vl, dx += vl*i__2, dy += vl*i__2) { + vl = VSETVL(n); + + v_w = VLSEV_FLOAT(&dx[1], stride, vl); + v_z__ = VLSEV_FLOAT(&dy[1], stride, vl); + + v_dx = VFMULVF_FLOAT(v_w, dh11, vl); + v_dx = VFMACCVF_FLOAT(v_dx, dh12, v_z__, vl); + VSSEV_FLOAT(&dx[1], stride, v_dx, vl); + + v_dy = VFMULVF_FLOAT(v_w, dh21, vl); + v_dy = VFMACCVF_FLOAT(v_dy, dh22, v_z__, vl); + VSSEV_FLOAT(&dy[1], stride, v_dy, vl); + } + goto L140; +L70: + kx = 1; + ky = 1; + if (incx < 0) { + kx = (1 - n) * incx + 1; + } + if (incy < 0) { + ky = (1 - n) * incy + 1; + } + + if (dflag < 0.) 
{ + goto L120; + } else if (dflag == 0) { + goto L80; + } else { + goto L100; + } +L80: + dh12 = dparam[4]; + dh21 = dparam[3]; + if(incx < 0){ + incx = -incx; + dx -= n*incx; + } + if(incy < 0){ + incy = -incy; + dy -= n*incy; + } + stride_x = incx * sizeof(FLOAT); + stride_y = incy * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, dx += vl*incx, dy += vl*incy) { + vl = VSETVL(n); + + v_w = VLSEV_FLOAT(&dx[kx], stride_x, vl); + v_z__ = VLSEV_FLOAT(&dy[ky], stride_y, vl); + + v_dx = VFMACCVF_FLOAT(v_w, dh12, v_z__, vl); + v_dy = VFMACCVF_FLOAT(v_z__, dh21, v_w, vl); + + VSSEV_FLOAT(&dx[kx], stride_x, v_dx, vl); + VSSEV_FLOAT(&dy[ky], stride_y, v_dy, vl); + } + goto L140; +L100: + dh11 = dparam[2]; + dh22 = dparam[5]; + if(incx < 0){ + incx = -incx; + dx -= n*incx; + } + if(incy < 0){ + incy = -incy; + dy -= n*incy; + } + stride_x = incx * sizeof(FLOAT); + stride_y = incy * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, dx += vl*incx, dy += vl*incy) { + vl = VSETVL(n); + + v_w = VLSEV_FLOAT(&dx[kx], stride_x, vl); + v_z__ = VLSEV_FLOAT(&dy[ky], stride_y, vl); + + v_dx = VFMACCVF_FLOAT(v_z__, dh11, v_w, vl); + v_dy = VFMSACVF_FLOAT(v_w, dh22, v_z__, vl); + + VSSEV_FLOAT(&dx[kx], stride_x, v_dx, vl); + VSSEV_FLOAT(&dy[ky], stride_y, v_dy, vl); + } + goto L140; +L120: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + if(incx < 0){ + incx = -incx; + dx -= n*incx; + } + if(incy < 0){ + incy = -incy; + dy -= n*incy; + } + stride_x = incx * sizeof(FLOAT); + stride_y = incy * sizeof(FLOAT); + for (size_t vl; n > 0; n -= vl, dx += vl*incx, dy += vl*incy) { + vl = VSETVL(n); + + v_w = VLSEV_FLOAT(&dx[kx], stride_x, vl); + v_z__ = VLSEV_FLOAT(&dy[ky], stride_y, vl); + + v_dx = VFMULVF_FLOAT(v_w, dh11, vl); + v_dx = VFMACCVF_FLOAT(v_dx, dh12, v_z__, vl); + VSSEV_FLOAT(&dx[kx], stride_x, v_dx, vl); + + v_dy = VFMULVF_FLOAT(v_w, dh21, vl); + v_dy = VFMACCVF_FLOAT(v_dy, dh22, v_z__, vl); + VSSEV_FLOAT(&dy[ky], stride_y, v_dy, vl); + } +L140: + return(0); +} diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index fa61a209e..09b148b3e 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -158,7 +158,7 @@ gotoblas_t TABLE_NAME = { #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) scopy_kTS, sdot_kTS, // dsdot_kTS, - srot_kTS, saxpy_kTS, + srot_kTS, saxpy_kTS, srotm_kTS, #endif #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) sscal_kTS, @@ -260,6 +260,7 @@ gotoblas_t TABLE_NAME = { #endif #if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) drot_kTS, + drotm_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, @@ -334,7 +335,7 @@ gotoblas_t TABLE_NAME = { qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, qgemv_nTS, qgemv_tTS, qger_kTS, qsymv_LTS, qsymv_UTS, - + qrotm_kTS, qgemm_kernelTS, qgemm_betaTS, #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N qgemm_incopyTS, qgemm_itcopyTS, diff --git a/kernel/sparc/KERNEL b/kernel/sparc/KERNEL index a8c958bb4..d6580609b 100644 --- a/kernel/sparc/KERNEL +++ b/kernel/sparc/KERNEL @@ -75,3 +75,14 @@ DGEMM_BETA = ../generic/gemm_beta.c CGEMM_BETA = ../generic/zgemm_beta.c ZGEMM_BETA = ../generic/zgemm_beta.c +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/x86/KERNEL b/kernel/x86/KERNEL index 83b51db13..1095c1528 100644 --- a/kernel/x86/KERNEL +++ b/kernel/x86/KERNEL @@ -189,3 +189,14 @@ ZGEMM_BETA = ../generic/zgemm_beta.c QGEMM_BETA 
= ../generic/gemm_beta.c XGEMM_BETA = ../generic/zgemm_beta.c +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/x86/KERNEL.generic b/kernel/x86/KERNEL.generic index 0aac0ce99..ada3ff42d 100644 --- a/kernel/x86/KERNEL.generic +++ b/kernel/x86/KERNEL.generic @@ -162,3 +162,15 @@ ZHEMV_L_KERNEL = ../generic/zhemv_k.c CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 2deb5a864..c270ff077 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -290,6 +290,18 @@ ifndef QROTKERNEL QROTKERNEL = rot.S endif +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif + ifndef CROTKERNEL CROTKERNEL = zrot_sse.S endif diff --git a/kernel/x86_64/KERNEL.generic b/kernel/x86_64/KERNEL.generic index 7cb0cb836..36dc9f43d 100644 --- a/kernel/x86_64/KERNEL.generic +++ b/kernel/x86_64/KERNEL.generic @@ -168,3 +168,15 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif + +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/zarch/KERNEL b/kernel/zarch/KERNEL index 68d68b5f8..cd9405203 100644 --- a/kernel/zarch/KERNEL +++ b/kernel/zarch/KERNEL @@ -27,4 +27,14 @@ ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif + +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/kernel/zarch/KERNEL.ZARCH_GENERIC b/kernel/zarch/KERNEL.ZARCH_GENERIC index 33850d0f7..6321cf6e3 100644 --- a/kernel/zarch/KERNEL.ZARCH_GENERIC +++ b/kernel/zarch/KERNEL.ZARCH_GENERIC @@ -135,5 +135,14 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ifndef SROTMKERNEL +SROTMKERNEL = ../generic/rotm.c +endif +ifndef DROTMKERNEL +DROTMKERNEL = ../generic/rotm.c +endif +ifndef QROTMKERNEL +QROTMKERNEL = ../generic/rotm.c +endif diff --git a/utest/test_rot.c b/utest/test_rot.c index 03776586b..e4ba44a03 100644 --- a/utest/test_rot.c +++ b/utest/test_rot.c @@ -70,6 +70,24 @@ CTEST(rot,drot_inc_1) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +CTEST(rot,drotm_inc_1) +{ + blasint i = 0; + blasint N = 12, incX = 1, incY = 1; + double param[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; + double x_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; + double y_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; + double x_referece[] = {3.0, 6.0, 9.0, 12.0, 15.0, 18.0, 21.0, 24.0, 27.0, 30.0, 33.0, 36.0}; + double y_referece[] = {4.0, 8.0, 12.0, 16.0, 20.0, 24.0, 28.0, 32.0, 36.0, 40.0, 44.0, 48.0}; + + //OpenBLAS + BLASFUNC(drotm)(&N, x_actual, &incX, y_actual, &incY, param); + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x_referece[i], x_actual[i], DOUBLE_EPS); + 
ASSERT_DBL_NEAR_TOL(y_referece[i], y_actual[i], DOUBLE_EPS); + } +} #endif #ifdef BUILD_COMPLEX16 @@ -130,6 +148,24 @@ CTEST(rot,srot_inc_1) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); } } +CTEST(rot,srotm_inc_1) +{ + blasint i = 0; + blasint N = 12, incX = 1, incY = 1; + float param[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; + float x_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; + float y_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; + float x_referece[] = {3.0, 6.0, 9.0, 12.0, 15.0, 18.0, 21.0, 24.0, 27.0, 30.0, 33.0, 36.0}; + float y_referece[] = {4.0, 8.0, 12.0, 16.0, 20.0, 24.0, 28.0, 32.0, 36.0, 40.0, 44.0, 48.0}; + + //OpenBLAS + BLASFUNC(srotm)(&N, x_actual, &incX, y_actual, &incY, param); + + for(i = 0; i < N; i++){ + ASSERT_DBL_NEAR_TOL(x_referece[i], x_actual[i], SINGLE_EPS); + ASSERT_DBL_NEAR_TOL(y_referece[i], y_actual[i], SINGLE_EPS); + } +} #endif #ifdef BUILD_COMPLEX From 5392f6df6908dfb64fd972d4c467058226547294 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 16 Jan 2025 19:50:22 +0800 Subject: [PATCH 019/205] LoongArch64: Fixed LASX version of cscal and zscal --- kernel/loongarch64/cscal_lasx.S | 244 ++++++++------------------------ 1 file changed, 61 insertions(+), 183 deletions(-) diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S index f53526663..daeb180e9 100644 --- a/kernel/loongarch64/cscal_lasx.S +++ b/kernel/loongarch64/cscal_lasx.S @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHAI $f1 #define X $r7 #define INCX $r8 +#define DUMMY2 $r9 #define I $r12 #define TEMP $r13 @@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, N, .L999 bge $r0, INCX, .L999 + ld.d DUMMY2, $sp, 0 li.d TEMP, 1 movgr2fr.d a1, $r0 FFINT a1, a1 @@ -86,24 +88,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif bne INCX, TEMP, .L22 +/////// INCX == 1 //////// .L11: - bge $r0, I, .L997 CMPEQ $fcc0, ALPHAR, a1 CMPEQ $fcc1, ALPHAI, a1 - bceqz $fcc0, .L13 - b .L14 - .align 3 + bge $r0, I, .L19 +/////// INCX == 1 && N >= 4 //////// + bnez DUMMY2, .L17 // if DUMMPY2 == 1, called from c/zscal. -.L13: - bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 - b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + bceqz $fcc0, .L17 -.L14: - bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 - b .L111 //alpha_r == 0.0 && alpha_i == 0.0 - .align 3 + bceqz $fcc1, .L17 -.L111: //alpha_r == 0.0 && alpha_i == 0.0 +.L15: //alpha_r == 0.0 && alpha_i == 0.0 xvst VXZ, X, 0 * SIZE #ifdef DOUBLE xvst VXZ, X, 4 * SIZE @@ -113,41 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi.d X, X, 16 * SIZE #endif addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 + blt $r0, I, .L15 + b .L19 .align 3 -.L113: //alpha_r != 0.0 && alpha_i == 0.0 - xvld VX0, X, 0 * SIZE -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvpickev.d x1, VX1, VX0 - xvpickod.d x2, VX1, VX0 - xvfmul.d x3, VXAR, x1 - xvfmul.d x4, VXAR, x2 - xvilvl.d VX2, x4 ,x3 - xvilvh.d VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE -#else - xvld VX1, X, 8 * SIZE - xvpickev.w x1, VX1, VX0 - xvpickod.w x2, VX1, VX0 - xvfmul.s x3, VXAR, x1 - xvfmul.s x4, VXAR, x2 - xvilvl.w VX2, x4 ,x3 - xvilvh.w VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 8 * SIZE - addi.d X, X, 16 * SIZE -#endif - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: //alpha_r != 0.0 && alpha_i != 0.0 +.L17: xvld VX0, X, 0 * SIZE #ifdef DOUBLE xvld VX1, X, 4 * SIZE @@ -177,29 +144,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d X, X, 16 * SIZE #endif addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 + blt $r0, I, .L17 + b .L19 + .align 3 + +/////// INCX == 1 && N < 8 /////// +.L19: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + beqz I, .L999 + bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. + + bceqz $fcc0, .L998 + + bceqz $fcc1, .L998 + + b .L995 // alpha_r == 0.0 && alpha_i == 0.0 .align 3 +/////// INCX != 1 //////// .L22: - bge $r0, I, .L997 - move XX, X CMPEQ $fcc0, ALPHAR, a1 CMPEQ $fcc1, ALPHAI, a1 - bceqz $fcc0, .L23 - b .L24 - .align 3 - -.L23: - bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 - b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + move XX, X + bge $r0, I, .L29 + bnez DUMMY2, .L25 // if DUMMPY2 == 1, called from c/zscal. + bceqz $fcc0, .L25 -.L24: - bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 - b .L221 //alpha_r == 0.0 && alpha_i == 0.0 - .align 3 + bceqz $fcc1, .L25 -.L221: //alpha_r == 0.0 && alpha_i == 0.0 +.L27: //alpha_r == 0.0 && alpha_i == 0.0 #ifdef DOUBLE xvstelm.d VXZ, X, 0, 0 xvstelm.d VXZ, X, 1 * SIZE, 0 @@ -239,122 +216,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif add.d X, X, INCX addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 + blt $r0, I, .L27 + b .L29 .align 3 -.L223: //alpha_r != 0.0 && alpha_i == 0.0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.d x1, t1, 0 - xvinsgr2vr.d x2, t2, 0 - xvinsgr2vr.d x1, t3, 1 - xvinsgr2vr.d x2, t4, 1 - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - xvinsgr2vr.d x1, t1, 2 - xvinsgr2vr.d x2, t2, 2 - xvinsgr2vr.d x1, t3, 3 - xvinsgr2vr.d x2, t4, 3 - add.d X, X, INCX - - xvfmul.d x3, VXAR, x1 - xvfmul.d x4, VXAR, x2 - addi.d I, I, -1 - xvstelm.d x3, XX, 0 * SIZE, 0 - xvstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 1 - xvstelm.d x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 2 - xvstelm.d x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 3 - xvstelm.d x4, XX, 1 * SIZE, 3 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 0 - xvinsgr2vr.w x2, t2, 0 - xvinsgr2vr.w x1, t3, 1 - xvinsgr2vr.w x2, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 2 - xvinsgr2vr.w x2, t2, 2 - xvinsgr2vr.w x1, t3, 3 - xvinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 4 - xvinsgr2vr.w x2, t2, 4 - xvinsgr2vr.w x1, t3, 5 - xvinsgr2vr.w x2, t4, 5 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 6 - xvinsgr2vr.w x2, t2, 6 - xvinsgr2vr.w x1, t3, 7 - xvinsgr2vr.w x2, t4, 7 - add.d X, X, INCX - - xvfmul.s x3, VXAR, x1 - xvfmul.s x4, VXAR, x2 - addi.d I, I, -1 - xvstelm.w x3, XX, 0 * SIZE, 0 - xvstelm.w x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 1 - xvstelm.w x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 2 - xvstelm.w x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 3 - xvstelm.w x4, XX, 1 * SIZE, 3 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 4 - xvstelm.w x4, XX, 1 * SIZE, 4 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 5 - xvstelm.w x4, XX, 1 * SIZE, 5 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 6 - xvstelm.w x4, XX, 1 * SIZE, 6 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 7 - xvstelm.w x4, XX, 1 * SIZE, 7 -#endif - add.d XX, XX, INCX - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: //alpha_r != 0.0 && alpha_i != 0.0 +.L25: #ifdef DOUBLE ld.d t1, X, 0 * SIZE ld.d t2, X, 1 * SIZE @@ -465,19 +331,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvstelm.w x4, XX, 1 * SIZE, 7 #endif add.d XX, XX, INCX - blt $r0, I, .L224 - b .L997 + blt $r0, I, .L25 + b .L29 .align 3 -.L997: +/////// INCX != 1 && N < 8 /////// +.L29: #ifdef DOUBLE - andi I, N, 3 + andi I, N, 3 #else - andi I, N, 7 + andi I, N, 7 #endif - bge $r0, I, .L999 - .align 3 + beqz I, .L999 + bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. 
+ bceqz $fcc0, .L998 + + bceqz $fcc1, .L998 + +.L995: // alpha_r == 0.0 && alpha_i == 0.0 + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L995 + b .L999 .L998: LD a1, X, 0 * SIZE LD a2, X, 1 * SIZE @@ -490,7 +368,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ST s2, X, 1 * SIZE add.d X, X, INCX blt $r0, I, .L998 - .align 3 + b .L999 .L999: move $r4, $r12 From 2da86b80c939187936dd155def9380332cb3a67b Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 22 Jan 2025 14:32:20 +0800 Subject: [PATCH 020/205] LoongArch64: Fixed scalar version of cscal and zscal --- kernel/loongarch64/zscal.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/loongarch64/zscal.S b/kernel/loongarch64/zscal.S index a12e527a5..f6213b159 100644 --- a/kernel/loongarch64/zscal.S +++ b/kernel/loongarch64/zscal.S @@ -53,6 +53,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE li.d TEMP, 2 * SIZE + ld.d XX, $sp, 0 // Load dummy2 + slli.d XX, XX, ZBASE_SHIFT MTC a1, $r0 slli.d INCX, INCX, ZBASE_SHIFT bge $r0, N, .L999 @@ -60,6 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. CMPEQ $fcc1, ALPHA_I, a1 bceqz $fcc0, .L50 bceqz $fcc1, .L50 + beq XX, TEMP, .L50 // if dummp2 == 1, do not directly copy 0 srai.d I, N, 2 bne INCX, TEMP, .L20 bge $r0, I, .L15 From 4d5b13f765ac4632c2330020ca2946b3de4a7aa0 Mon Sep 17 00:00:00 2001 From: Marek Michalowski Date: Tue, 21 Jan 2025 12:29:58 +0000 Subject: [PATCH 021/205] Add thread throttling profile for SGEMV on `NEOVERSEV1` --- CONTRIBUTORS.md | 5 ++++- interface/gemv.c | 36 +++++++++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 508dbcd0e..fcc80cc7e 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -231,4 +231,7 @@ In chronological order: * [2024-01-24] Optimize GEMV forwarding on ARM64 systems * Aniket P. Garade Sushil Pratap Singh Juliya James - * [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE + * [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE + +* Marek Michalowski + * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` diff --git a/interface/gemv.c b/interface/gemv.c index 2c121f130..f91f364ee 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -63,6 +63,36 @@ static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT }; #endif +#ifdef DYNAMIC_ARCH + extern char* gotoblas_corename(void); +#endif + +#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) +static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { + return + MN < 25600L ? 1 + : MN < 63001L ? MIN(ncpu, 4) + : MN < 459684L ? 
MIN(ncpu, 16) + : ncpu; +} +#endif + +static inline int get_gemv_optimal_nthreads(BLASLONG MN) { + int ncpu = num_cpu_avail(3); +#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) + return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); +#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) + if (strcmp(gotoblas_corename(), "neoversev1") == 0) { + return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); + } +#endif + + if ( MN < 115200L * GEMM_MULTITHREAD_THRESHOLD ) + return 1; + else + return num_cpu_avail(2); +} + #ifndef CBLAS void NAME(char *TRANS, blasint *M, blasint *N, @@ -225,11 +255,7 @@ void CNAME(enum CBLAS_ORDER order, STACK_ALLOC(buffer_size, FLOAT, buffer); #ifdef SMP - - if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD ) - nthreads = 1; - else - nthreads = num_cpu_avail(2); + nthreads = get_gemv_optimal_nthreads(1L * m * n); if (nthreads == 1) { #endif From b58cba9eb6e32f3abae6c2f5a712039c6cca54de Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 22 Jan 2025 15:51:49 +0100 Subject: [PATCH 022/205] fix qrotm build rules --- kernel/Makefile.L1 | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 6e864e3d8..0fc672094 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -530,7 +530,7 @@ QBLASOBJS += \ iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ - qsum_k$(TSUFFIX).$(SUFFIX) + qsum_k$(TSUFFIX).$(SUFFIX) qrotm_k$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ @@ -853,14 +853,17 @@ $(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERN $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ +$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ + $(KDIR)srotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)srotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTMKERNEL) $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)drotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)drotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTMKERNEL) $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ -$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ +$(KDIR)qrotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTMKERNEL) + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@ From 4924319c508bbf72bd0ff9d56a5deff5fb58f31b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 22 Jan 2025 16:07:35 +0100 Subject: [PATCH 023/205] fix position of srotm, qrotm --- kernel/setparam-ref.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c 
index 09b148b3e..3ed45697e 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -72,9 +72,9 @@ gotoblas_t TABLE_NAME = { samax_kTS, samin_kTS, smax_kTS, smin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, - snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS, + snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS, dsdot_kTS, - srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, + srot_kTS, srotm_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, sbgemv_nTS, sbgemv_tTS, sger_kTS, ssymv_LTS, ssymv_UTS, @@ -158,7 +158,7 @@ gotoblas_t TABLE_NAME = { #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) scopy_kTS, sdot_kTS, // dsdot_kTS, - srot_kTS, saxpy_kTS, srotm_kTS, + srot_kTS, srotm_kTS, saxpy_kTS, #endif #if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) sscal_kTS, @@ -332,10 +332,9 @@ gotoblas_t TABLE_NAME = { qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS, iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS, qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS, - qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, + qrot_kTS, qrotm_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, qgemv_nTS, qgemv_tTS, qger_kTS, qsymv_LTS, qsymv_UTS, - qrotm_kTS, qgemm_kernelTS, qgemm_betaTS, #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N qgemm_incopyTS, qgemm_itcopyTS, From 111c9b0733008175bcfdccc5e3329ac96b314c69 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 22 Jan 2025 19:51:43 +0100 Subject: [PATCH 024/205] Add translations for C_COMPILER and OSNAME --- cmake/utils.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 9befc9a3c..a93f21686 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -16,6 +16,14 @@ endfunction () macro(ParseMakefileVars MAKEFILE_IN) message(STATUS "Reading vars from ${MAKEFILE_IN}...") set (C_COMPILER ${CMAKE_C_COMPILER_ID}) + set (OSNAME ${CMAKE_SYSTEM_NAME}) + if (${C_COMPILER} MATCHES Clang) + set (C_COMPILER CLANG) + endif () + if (${OSNAME} STREQUAL Windows) + set (OSNAME WINNT) + endif () +message(STATUS OS ${OSNAME} COMPILER ${C_COMPILER}) set (IfElse 0) set (ElseSeen 0) set (SkipIfs 0) From 1a6a9fb22f66cacabe620bd5be897833ecfaaded Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jan 2025 00:17:04 +0100 Subject: [PATCH 025/205] add another generator line for rotm --- kernel/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index bc713e603..b43cda2c1 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTMKERNEL}" "" "rotm_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) From d1bfa979f7830ddf799cde43cfc6ae22000c4c52 Mon Sep 17 00:00:00 2001 From: Deeksha Goplani Date: Thu, 23 Jan 2025 09:41:45 +0530 Subject: [PATCH 026/205] small gemm 
kernel packing modifications --- kernel/arm64/dgemm_small_kernel_tn_sve.c | 2 +- kernel/arm64/dgemm_small_kernel_tt_sve.c | 2 +- kernel/arm64/sgemm_small_kernel_tn_sve.c | 2 +- kernel/arm64/sgemm_small_kernel_tt_sve.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/dgemm_small_kernel_tn_sve.c b/kernel/arm64/dgemm_small_kernel_tn_sve.c index 2ef23d7ee..8419e5065 100644 --- a/kernel/arm64/dgemm_small_kernel_tn_sve.c +++ b/kernel/arm64/dgemm_small_kernel_tn_sve.c @@ -213,7 +213,7 @@ CNAME(BLASLONG M, const BLASLONG n2 = N & -2; const BLASLONG n8 = N & -8; - const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; + const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; FLOAT* packed_a = (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; diff --git a/kernel/arm64/dgemm_small_kernel_tt_sve.c b/kernel/arm64/dgemm_small_kernel_tt_sve.c index efe11a9f9..0f06b4ecb 100644 --- a/kernel/arm64/dgemm_small_kernel_tt_sve.c +++ b/kernel/arm64/dgemm_small_kernel_tt_sve.c @@ -219,7 +219,7 @@ CNAME(BLASLONG M, const BLASLONG n4 = N & -4; const BLASLONG n2 = N & -2; - const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; + const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; FLOAT* packed_a = (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; diff --git a/kernel/arm64/sgemm_small_kernel_tn_sve.c b/kernel/arm64/sgemm_small_kernel_tn_sve.c index 114640950..c874af400 100644 --- a/kernel/arm64/sgemm_small_kernel_tn_sve.c +++ b/kernel/arm64/sgemm_small_kernel_tn_sve.c @@ -222,7 +222,7 @@ CNAME(BLASLONG M, const BLASLONG n8 = N & -8; const BLASLONG n4 = N & -4; - const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; + const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; FLOAT* packed_a = (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; diff --git a/kernel/arm64/sgemm_small_kernel_tt_sve.c b/kernel/arm64/sgemm_small_kernel_tt_sve.c index 731c9861b..b29e3e46b 100644 --- a/kernel/arm64/sgemm_small_kernel_tt_sve.c +++ b/kernel/arm64/sgemm_small_kernel_tt_sve.c @@ -223,7 +223,7 @@ CNAME(BLASLONG M, const BLASLONG n8 = N & -8; const BLASLONG n4 = N & -4; - const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; + const int pack_a = M >= v_size2 && N >= 8 ? 1 : 0; FLOAT* packed_a = (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; From 1ebcbdbab35e0b027e06347c241a38fb61adbf82 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 23 Jan 2025 09:08:42 +0000 Subject: [PATCH 027/205] LoongArch64: Fixed the issue of using the old-style TARGET in cmake builds --- cmake/system.cmake | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 7413c88c8..9c437fc99 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -21,7 +21,15 @@ endif() # Other files expect CORE, which is actually TARGET and will become TARGET_CORE for kernel build. Confused yet? # It seems we are meant to use TARGET as input and CORE internally as kernel. if(NOT DEFINED CORE AND DEFINED TARGET) - set(CORE ${TARGET}) + if (${TARGET} STREQUAL "LOONGSON3R5") + set(CORE "LA464") + elseif (${TARGET} STREQUAL "LOONGSON2K1000") + set(CORE "LA264") + elseif (${TARGET} STREQUAL "LOONGSONGENERIC") + set(CORE "LA64_GENERIC)") + else () + set(CORE ${TARGET}) + endif() endif() # TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. 
From 9faebb3c974ce3665f879e5af8df0b6016e140fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jan 2025 17:59:45 +0100 Subject: [PATCH 028/205] fix lost indentation in the rules for the thread safety test --- Makefile.install | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.install b/Makefile.install index cd1dcdabc..10e6425cc 100644 --- a/Makefile.install +++ b/Makefile.install @@ -315,8 +315,8 @@ endif endif ifeq ($(CPP_THREAD_SAFETY_TEST), 1) -@install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) -@install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) endif endif From 252c43265d155f6e14b6879c9fe322afcca6d77c Mon Sep 17 00:00:00 2001 From: Matthew Brett Date: Fri, 24 Jan 2025 12:58:20 +0000 Subject: [PATCH 029/205] Fix Windows on ARM build instructions The command as merged uses the compiler target as the compiler path. I have run and tested a build with this command. @Mugundanmcw - is this correct? --- docs/install.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/install.md b/docs/install.md index a3174202f..9b0d45a49 100644 --- a/docs/install.md +++ b/docs/install.md @@ -480,13 +480,13 @@ the LLVM toolchain enables native compilation of the Fortran sources of LAPACK a 4. Navigate to the OpenBLAS source code directory and start building OpenBLAS by invoking Ninja: - + ```cmd cd OpenBLAS mkdir build cd build - - cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_C_COMPILER=arm64-pc-windows-msvc -DCMAKE_ASM_COMPILER=arm64-pc-windows-msvc -DCMAKE_Fortran_COMPILER=flang-new + + cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_C_COMPILER_TARGET=arm64-pc-windows-msvc -DCMAKE_ASM_COMPILER_TARGET=arm64-pc-windows-msvc -DCMAKE_Fortran_COMPILER=flang-new ninja -j16 ``` From 1829ac5b4433283d336973e144ba9e57a4201ce9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Jan 2025 17:32:11 +0100 Subject: [PATCH 030/205] Add (dummy) declaration of SBROT_M --- common_param.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_param.h b/common_param.h index a3e4cea6b..f74a106db 100644 --- a/common_param.h +++ b/common_param.h @@ -77,6 +77,7 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); double (*dsbdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); + int (*sbrotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); From 7a27e2b00dbab44355b1850264d6c7222b5afc9b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 26 Jan 2025 18:36:58 +0100 Subject: [PATCH 031/205] Simplify build instructions for Windows on Arm --- docs/install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/install.md b/docs/install.md index 9b0d45a49..656c6a121 100644 --- a/docs/install.md +++ b/docs/install.md @@ -486,7 +486,7 @@ the LLVM toolchain enables native compilation of the Fortran sources of LAPACK a mkdir build cd build - cmake .. 
-G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_C_COMPILER_TARGET=arm64-pc-windows-msvc -DCMAKE_ASM_COMPILER_TARGET=arm64-pc-windows-msvc -DCMAKE_Fortran_COMPILER=flang-new + cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang-new ninja -j16 ``` From d7036cfd7492ffa0a0debea48cab312f6237bad0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 27 Jan 2025 09:32:17 +0100 Subject: [PATCH 032/205] Remove trailing blanks that break the cmake parser --- kernel/power/KERNEL.PPCG4 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index 0297df597..27e776a0f 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -71,12 +71,12 @@ CSCALKERNEL = zscal_ppc440.S ZSCALKERNEL = zscal_ppc440.S SGEMMKERNEL = gemm_kernel_g4.S -SGEMMINCOPY = -SGEMMITCOPY = +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_g4.S From 5de507294003526684483086edfbba0843009286 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 Jan 2025 16:55:26 +0100 Subject: [PATCH 033/205] Improve flang-new identification and add CI job for it on OSX-x86_64 (#5103) * AzureCI: Add LLVM/flang-new build on OSX-x86_64 * distinguish classic flang from flang-new in name based recognition --- azure-pipelines.yml | 14 +++++++++++++- f_check | 7 +++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0bdf4e316..26f4c2af3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -164,7 +164,19 @@ jobs: - script: | brew update make CC=gcc-12 FC=gfortran-12 - + +- job: OSX_LLVM_flangnew + pool: + vmImage: 'macOS-latest' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + brew update + brew install llvm flang + make TARGET=NEHALEM CC=/usr/local/opt/llvm/bin/clang FC=/usr/local/Cellar/flang/19.1.7_1/bin/flang-new NO_SHARED=1 + - job: OSX_OpenMP_Clang pool: vmImage: 'macOS-latest' diff --git a/f_check b/f_check index 93c5962de..244f6bcae 100755 --- a/f_check +++ b/f_check @@ -245,6 +245,13 @@ else ;; *flang*) vendor=FLANG + data=`$compiler -v 2>&1 > /dev/null` + v="${data#*version *}" + v="${v%%*.}" + major="${v%%.*}" + if [ "$major" -ge 17 ]; then + vendor=FLANGNEW + fi bu=_ openmp='-fopenmp' ;; From 6cd9bbe531c2551b52e2b1931039c6a4e7ba8471 Mon Sep 17 00:00:00 2001 From: John Hein Date: Sat, 1 Feb 2025 17:16:05 -0700 Subject: [PATCH 034/205] fix signedness of pointer to integer type passed to blas_lock() --- driver/level3/level3_thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 3d56c45a9..a37292e8e 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -547,7 +547,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG #ifdef USE_OPENMP static omp_lock_t level3_lock, critical_section_lock; - static volatile BLASLONG init_lock = 0, omp_lock_initialized = 0, + static volatile BLASULONG init_lock = 0, omp_lock_initialized = 0, parallel_section_left = MAX_PARALLEL_NUMBER; // Lock initialization; Todo : Maybe this part can be 
moved to blas_init() in blas_server_omp.c From 4379a6fbe37038082c657bba5be5c67331a0bd0b Mon Sep 17 00:00:00 2001 From: Aditya Tewari Date: Tue, 5 Nov 2024 16:22:45 +0000 Subject: [PATCH 035/205] * checkpoint sbgemm for SVE-256 --- cmake/system.cmake | 4 +- kernel/arm64/KERNEL.NEOVERSEV1 | 14 +- kernel/arm64/sbgemm_beta_neoversev1.c | 83 +++ kernel/arm64/sbgemm_kernel_8x4_neoversev1.c | 46 ++ .../arm64/sbgemm_kernel_8x4_neoversev1_impl.c | 472 ++++++++++++++++++ kernel/arm64/sbgemm_ncopy_4_neoversev1.c | 127 +++++ kernel/arm64/sbgemm_ncopy_8_neoversev1.c | 180 +++++++ kernel/arm64/sbgemm_tcopy_4_neoversev1.c | 148 ++++++ kernel/arm64/sbgemm_tcopy_8_neoversev1.c | 200 ++++++++ param.h | 7 + 10 files changed, 1277 insertions(+), 4 deletions(-) create mode 100644 kernel/arm64/sbgemm_beta_neoversev1.c create mode 100644 kernel/arm64/sbgemm_kernel_8x4_neoversev1.c create mode 100644 kernel/arm64/sbgemm_kernel_8x4_neoversev1_impl.c create mode 100644 kernel/arm64/sbgemm_ncopy_4_neoversev1.c create mode 100644 kernel/arm64/sbgemm_ncopy_8_neoversev1.c create mode 100644 kernel/arm64/sbgemm_tcopy_4_neoversev1.c create mode 100644 kernel/arm64/sbgemm_tcopy_8_neoversev1.c diff --git a/cmake/system.cmake b/cmake/system.cmake index 9c437fc99..1bcd7eef9 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -291,10 +291,10 @@ if (DEFINED TARGET) if (${TARGET} STREQUAL NEOVERSEV1) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1") else () if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.4 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.4) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1") else () message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_VERSION} does not support Neoverse V1.") endif() diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index 859466409..7a7de3c7a 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -1,4 +1,14 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE -SGEMVTKERNEL = gemv_t_sve_v1x3.c -DGEMVTKERNEL = gemv_t_sve_v1x3.c +SGEMVTKERNEL = gemv_t_sve.c +DGEMVTKERNEL = gemv_t_sve.c +SBGEMM_BETA = sbgemm_beta_neoversev1.c +SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversev1.c +SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_M)_neoversev1.c +SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversev1.c +SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c +SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c +SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) +SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/sbgemm_beta_neoversev1.c b/kernel/arm64/sbgemm_beta_neoversev1.c new file mode 100644 index 000000000..572d499d7 --- /dev/null +++ b/kernel/arm64/sbgemm_beta_neoversev1.c @@ -0,0 +1,83 @@ +/*************************************************************************** + * Copyright (c) 2024, The OpenBLAS Project + * All rights reserved. 
+ * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, IFLOAT *dummy2, + BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, + BLASLONG ldc) { + + BLASLONG i, j; + BLASLONG chunk, remain; + FLOAT *c_offset1, *c_offset; + c_offset = c; + chunk = m >> 3; + remain = m & 7; + if (beta == ZERO) { + for (j = n; j > 0; j--) { + c_offset1 = c_offset; + c_offset += ldc; + for (i = chunk; i > 0; i--) { + *(c_offset1 + 0) = ZERO; + *(c_offset1 + 1) = ZERO; + *(c_offset1 + 2) = ZERO; + *(c_offset1 + 3) = ZERO; + *(c_offset1 + 4) = ZERO; + *(c_offset1 + 5) = ZERO; + *(c_offset1 + 6) = ZERO; + *(c_offset1 + 7) = ZERO; + c_offset1 += 8; + } + for (i = remain; i > 0; i--) { + *c_offset1 = ZERO; + c_offset1++; + } + } + } else { + for (j = n; j > 0; j--) { + c_offset1 = c_offset; + c_offset += ldc; + for (i = chunk; i > 0; i--) { + *(c_offset1 + 0) *= beta; + *(c_offset1 + 1) *= beta; + *(c_offset1 + 2) *= beta; + *(c_offset1 + 3) *= beta; + *(c_offset1 + 4) *= beta; + *(c_offset1 + 5) *= beta; + *(c_offset1 + 6) *= beta; + *(c_offset1 + 7) *= beta; + c_offset1 += 8; + } + for (i = remain; i > 0; i--) { + *c_offset1 *= beta; + c_offset1++; + } + } + } + return 0; +}; diff --git a/kernel/arm64/sbgemm_kernel_8x4_neoversev1.c b/kernel/arm64/sbgemm_kernel_8x4_neoversev1.c new file mode 100644 index 000000000..d866fb335 --- /dev/null +++ b/kernel/arm64/sbgemm_kernel_8x4_neoversev1.c @@ -0,0 +1,46 @@ +/*************************************************************************** + * Copyright (c) 2024, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include + +#include "common.h" + +#define ALPHA_ONE +#include "sbgemm_kernel_8x4_neoversev1_impl.c" +#undef ALPHA_ONE +#include "sbgemm_kernel_8x4_neoversev1_impl.c" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, + FLOAT *C, BLASLONG ldc) { + if (alpha == 1.0f) + return sbgemm_kernel_neoversev1_alpha_one(m, n, k, alpha, A, B, C, ldc); + else + return sbgemm_kernel_neoversev1_alpha(m, n, k, alpha, A, B, C, ldc); + return 0; +} + diff --git a/kernel/arm64/sbgemm_kernel_8x4_neoversev1_impl.c b/kernel/arm64/sbgemm_kernel_8x4_neoversev1_impl.c new file mode 100644 index 000000000..86daa117e --- /dev/null +++ b/kernel/arm64/sbgemm_kernel_8x4_neoversev1_impl.c @@ -0,0 +1,472 @@ +/*************************************************************************** + * Copyright (c) 2024, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include + +#include "common.h" + +#define INIT_C(M, N) mc##M##N = svdup_f32(0); + +#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); + +#define INIT_C_8x4 \ + do { \ + INIT_C(0, 0); \ + INIT_C(0, 1); \ + INIT_C(1, 0); \ + INIT_C(1, 1); \ + INIT_C(2, 0); \ + INIT_C(2, 1); \ + INIT_C(3, 0); \ + INIT_C(3, 1); \ + } while (0); + +#ifdef ALPHA_ONE +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + DST = svld1_f32((PG), (PTR)); \ + DST = svadd_z((PG), SRC, DST); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); +#else +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + DST = svld1_f32((PG), (PTR)); \ + DST = svmad_z((PG), svalpha, SRC, DST); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); +#endif + +#ifdef ALPHA_ONE +int sbgemm_kernel_neoversev1_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#else +int sbgemm_kernel_neoversev1_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG pad_k = (k + 3) & ~3; + + svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; + svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31, + vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7, + oc0, oc1, oc2, oc3, oc4, oc5, oc6, oc7; + svfloat32_t svalpha = svdup_f32(alpha); + + svbool_t pg16 = svptrue_b16(); + svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); + svbool_t pg32 = svptrue_b32(); + svbool_t pg32_low = svdupq_b32(1, 1, 0, 0); + svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); + + bfloat16_t *ptr_a = (bfloat16_t *)A; + bfloat16_t *ptr_b = (bfloat16_t *)B; + FLOAT *ptr_c = C; + + bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; + bfloat16_t *ptr_b0, *ptr_b1; + FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; + + for (BLASLONG j = 0; j < n / 4; j++) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c2 = ptr_c1 + ldc; + ptr_c3 = ptr_c2 + ldc; + ptr_c += 4 * ldc; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + INIT_C_8x4; + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + MATMUL(2, 0); MATMUL(2, 1); + MATMUL(3, 0); MATMUL(3, 1); + + ptr_a0 += 32; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + vc2 = svuzp2(mc00, mc10); + vc3 = svuzp2(mc20, mc30); + vc4 = svuzp1(mc01, mc11); + vc5 = svuzp1(mc21, mc31); + vc6 = svuzp2(mc01, mc11); + vc7 = svuzp2(mc21, mc31); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c0+4, oc1, vc1); + UPDATE_C(pg32, ptr_c1, oc2, vc2); + UPDATE_C(pg32, ptr_c1+4, oc3, vc3); + UPDATE_C(pg32, ptr_c2, oc4, vc4) + UPDATE_C(pg32, ptr_c2+4, oc5, vc5); + 
UPDATE_C(pg32, ptr_c3, oc6, vc6) + UPDATE_C(pg32, ptr_c3+4, oc7, vc7); + + ptr_c0 += 8; + ptr_c1 += 8; + ptr_c2 += 8; + ptr_c3 += 8; + } + + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); INIT_C(0, 1); + INIT_C(1, 0); INIT_C(1, 1); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + + ptr_a0 += 16; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp2(mc00, mc10); + vc2 = svuzp1(mc01, mc11); + vc3 = svuzp2(mc01, mc11); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c1, oc1, vc1); + UPDATE_C(pg32, ptr_c2, oc2, vc2); + UPDATE_C(pg32, ptr_c3, oc3, vc3); + + ptr_c0 += 4; + ptr_c1 += 4; + ptr_c2 += 4; + ptr_c3 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); INIT_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + + ptr_a0 += 8; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + vc2 = svuzp1(mc01, mc01); + vc3 = svuzp2(mc01, mc01); + + UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + UPDATE_C(pg32_low, ptr_c1, oc1, vc1); + UPDATE_C(pg32_low, ptr_c2, oc2, vc2); + UPDATE_C(pg32_low, ptr_c3, oc3, vc3); + + ptr_c0 += 2; + ptr_c1 += 2; + ptr_c2 += 2; + ptr_c3 += 2; + } + + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + + INIT_C(0, 0); INIT_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + + ptr_a0 += 4; + ptr_b0 += 16; + } + + vc1 = svuzp2(mc00, mc00); + vc3 = svuzp2(mc01, mc01); + + UPDATE_C(pg32_first, ptr_c0, oc0, mc00); + UPDATE_C(pg32_first, ptr_c1, oc1, vc1); + UPDATE_C(pg32_first, ptr_c2, oc2, mc01); + UPDATE_C(pg32_first, ptr_c3, oc3, vc3); + + } + + ptr_b += 4 * pad_k; + } + + if (n & 2) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c += 2 * ldc; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + INIT_C(0, 0); + INIT_C(1, 0); + INIT_C(2, 0); + INIT_C(3, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16, ptr_b0); + + MATMUL(0, 0); + MATMUL(1, 0); + MATMUL(2, 0); + MATMUL(3, 0); + + ptr_a0 += 32; + ptr_b0 += 8; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + vc2 = svuzp2(mc00, mc10); + vc3 = svuzp2(mc20, mc30); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1); + UPDATE_C(pg32, ptr_c1, oc2, vc2); + UPDATE_C(pg32, ptr_c1 + 4, oc3, vc3); + + ptr_c0 += 8; + ptr_c1 += 8; + } + + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); + INIT_C(1, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16, ptr_b0); + MATMUL(0, 0); + MATMUL(1, 0); + ptr_a0 += 16; + ptr_b0 += 8; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp2(mc00, mc10); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c1, oc1, vc1); + + ptr_c0 += 4; + ptr_c1 += 4; + } + + if (m & 
2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 8; + ptr_b0 += 8; + } + + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + UPDATE_C(pg32_low, ptr_c1, oc1, vc1); + + ptr_c0 += 2; + ptr_c1 += 2; + + } + + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + INIT_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + MATMUL(0, 0); + ptr_a0 += 4; + ptr_b0 += 8; + } + vc1 = svuzp2(mc00, mc00); + + UPDATE_C(pg32_first, ptr_c0, oc0, mc00); + UPDATE_C(pg32_first, ptr_c1, oc1, vc1); + } + + ptr_b += 2 * pad_k; + } + + if (n & 1) { + ptr_c0 = ptr_c; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + INIT_C(0, 0); + INIT_C(1, 0); + INIT_C(2, 0); + INIT_C(3, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16_low, ptr_b0); + + MATMUL(0, 0); + MATMUL(1, 0); + MATMUL(2, 0); + MATMUL(3, 0); + + ptr_a0 += 32; + ptr_b0 += 4; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1); + + ptr_c0 += 8; + } + + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + INIT_C(0, 0); + INIT_C(1, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16_low, ptr_b0); + MATMUL(0, 0); + MATMUL(1, 0); + ptr_a0 += 16; + ptr_b0 += 4; + } + vc0 = svuzp1(mc00, mc10); + UPDATE_C(pg32, ptr_c0, oc0, vc0); + ptr_c0 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16_low, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 8; + ptr_b0 += 4; + } + vc0 = svuzp1(mc00, mc00); + UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + ptr_c0 += 2; + } + + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + INIT_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16_low, ptr_b0); + MATMUL(0, 0); + ptr_a0 += 4; + ptr_b0 += 4; + } + UPDATE_C(pg32_first, ptr_c0, oc0, mc00); + } + } + + return 0; +} + diff --git a/kernel/arm64/sbgemm_ncopy_4_neoversev1.c b/kernel/arm64/sbgemm_ncopy_4_neoversev1.c new file mode 100644 index 000000000..59d0dc58c --- /dev/null +++ b/kernel/arm64/sbgemm_ncopy_4_neoversev1.c @@ -0,0 +1,127 @@ +/*************************************************************************** + * Copyright (c) 2024, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. 
Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *a_offset; + IFLOAT *a_offsetx[4]; + IFLOAT *b_offset; + a_offset = a; + b_offset = b; + + svbool_t pg16 = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); + svbfloat16_t v0, v1, v2, v3; + + for (BLASLONG j = 0; j < n / 4; j++) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offsetx[2] = a_offsetx[1] + lda; + a_offsetx[3] = a_offsetx[2] + lda; + a_offset += 4 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]); + v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]); + + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3); + + b_offset += 16; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + a_offsetx[2] += 4; + a_offsetx[3] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 4; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; + } + b_offset += 16; + } + } + + if (n & 2) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offset += 2 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + + b_offset += 8; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 2; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 
0 : a_offsetx[col][3]; + } + b_offset += 8; + } + } + + if (n & 1) { + a_offsetx[0] = a_offset; + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + b_offset += 4; + a_offsetx[0] += 4; + } + if (m & 3) { + BLASLONG rest = m & 3; + b_offset[0] = a_offsetx[0][0]; + b_offset[1] = rest == 1 ? 0 : a_offsetx[0][1]; + b_offset[2] = rest <= 2 ? 0 : a_offsetx[0][2]; + b_offset[3] = rest <= 3 ? 0 : a_offsetx[0][3]; + } + } + + return 0; +} + diff --git a/kernel/arm64/sbgemm_ncopy_8_neoversev1.c b/kernel/arm64/sbgemm_ncopy_8_neoversev1.c new file mode 100644 index 000000000..34412f05f --- /dev/null +++ b/kernel/arm64/sbgemm_ncopy_8_neoversev1.c @@ -0,0 +1,180 @@ +/*************************************************************************** + * Copyright (c) 2024, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * *****************************************************************************/ + +#include + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *a_offset; + IFLOAT *a_offsetx[8]; + IFLOAT *b_offset; + a_offset = a; + b_offset = b; + + svbool_t pg16 = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); + svbfloat16_t v0, v1, v2, v3, v4, v5, v6, v7; + + for (BLASLONG j = 0; j < n / 8; j++) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offsetx[2] = a_offsetx[1] + lda; + a_offsetx[3] = a_offsetx[2] + lda; + a_offsetx[4] = a_offsetx[3] + lda; + a_offsetx[5] = a_offsetx[4] + lda; + a_offsetx[6] = a_offsetx[5] + lda; + a_offsetx[7] = a_offsetx[6] + lda; + a_offset += 8 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]); + v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]); + v4 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[4]); + v5 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[5]); + v6 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[6]); + v7 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[7]); + + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 16, v4); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 20, v5); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 24, v6); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 28, v7); + + b_offset += 32; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + a_offsetx[2] += 4; + a_offsetx[3] += 4; + a_offsetx[4] += 4; + a_offsetx[5] += 4; + a_offsetx[6] += 4; + a_offsetx[7] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 8; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; + } + b_offset += 32; + } + } + + if (n & 4) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offsetx[2] = a_offsetx[1] + lda; + a_offsetx[3] = a_offsetx[2] + lda; + a_offset += 4 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]); + v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]); + + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3); + + b_offset += 16; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + a_offsetx[2] += 4; + a_offsetx[3] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 4; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 
0 : a_offsetx[col][3]; + } + b_offset += 16; + } + } + + if (n & 2) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offset += 2 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + + b_offset += 8; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 2; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; + } + b_offset += 8; + } + } + + if (n & 1) { + a_offsetx[0] = a_offset; + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + b_offset += 4; + a_offsetx[0] += 4; + } + if (m & 3) { + BLASLONG rest = m & 3; + b_offset[0] = a_offsetx[0][0]; + b_offset[1] = rest == 1 ? 0 : a_offsetx[0][1]; + b_offset[2] = rest <= 2 ? 0 : a_offsetx[0][2]; + b_offset[3] = rest <= 3 ? 0 : a_offsetx[0][3]; + } + } + + return 0; +} + diff --git a/kernel/arm64/sbgemm_tcopy_4_neoversev1.c b/kernel/arm64/sbgemm_tcopy_4_neoversev1.c new file mode 100644 index 000000000..5f6241ff8 --- /dev/null +++ b/kernel/arm64/sbgemm_tcopy_4_neoversev1.c @@ -0,0 +1,148 @@ +/*************************************************************************** + * Copyright (c) 2024, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * *****************************************************************************/ +#include + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *a_offset, *a_offset0, *a_offset1, *a_offset2, *a_offset3; + IFLOAT *b_offset; + a_offset = a; + b_offset = b; + + uint16x4_t v0_h, v1_h, v2_h, v3_h, v4_h, v5_h, v6_h, v7_h; + + for (BLASLONG j = 0; j < n / 4; j++) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 4; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0_h = vld1_u16(a_offset0); + v1_h = vld1_u16(a_offset1); + v2_h = vld1_u16(a_offset2); + v3_h = vld1_u16(a_offset3); + + v4_h = vtrn1_u16(v0_h, v1_h); + v5_h = vtrn2_u16(v0_h, v1_h); + v6_h = vtrn1_u16(v2_h, v3_h); + v7_h = vtrn2_u16(v2_h, v3_h); + + v0_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h); + v1_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h); + v2_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h); + v3_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h); + + vst1_u16(b_offset, v0_h); + vst1_u16(b_offset + 4, v1_h); + vst1_u16(b_offset + 8, v2_h); + vst1_u16(b_offset + 12, v3_h); + + b_offset += 16; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 4; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; + } + b_offset += 16; + } + } + + if (n & 2) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 2; + + for (BLASLONG i = 0; i < m / 4; i++) { + for (BLASLONG line = 0; line < 2; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = a_offset1[line]; + b_offset[line * 4 + 2] = a_offset2[line]; + b_offset[line * 4 + 3] = a_offset3[line]; + } + b_offset += 8; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 2; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; + } + b_offset += 8; + } + } + + if (n & 1) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + b_offset[0] = *a_offset0; + b_offset[1] = *a_offset1; + b_offset[2] = *a_offset2; + b_offset[3] = *a_offset3; + b_offset += 4; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + b_offset[0] = *a_offset0; + b_offset[1] = rest == 1 ? 0 : *a_offset1; + b_offset[2] = rest <= 2 ? 0 : *a_offset2; + b_offset[3] = rest <= 3 ? 
0 : *a_offset3; + } + } + return 0; +} + diff --git a/kernel/arm64/sbgemm_tcopy_8_neoversev1.c b/kernel/arm64/sbgemm_tcopy_8_neoversev1.c new file mode 100644 index 000000000..1a1198d02 --- /dev/null +++ b/kernel/arm64/sbgemm_tcopy_8_neoversev1.c @@ -0,0 +1,200 @@ +/*************************************************************************** + * Copyright (c) 2024, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * *****************************************************************************/ +#include + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *a_offset, *a_offset0, *a_offset1, *a_offset2, *a_offset3; + IFLOAT *b_offset; + a_offset = a; + b_offset = b; + + uint16x8_t v0, v1, v2, v3, v4, v5, v6, v7; + uint16x4_t v0_h, v1_h, v2_h, v3_h, v4_h, v5_h, v6_h, v7_h; + + for (BLASLONG j = 0; j < n / 8; j++) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 8; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = vld1q_u16(a_offset0); + v1 = vld1q_u16(a_offset1); + v2 = vld1q_u16(a_offset2); + v3 = vld1q_u16(a_offset3); + + v4 = vtrn1q_u16(v0, v1); + v5 = vtrn2q_u16(v0, v1); + v6 = vtrn1q_u16(v2, v3); + v7 = vtrn2q_u16(v2, v3); + + v0 = (uint16x8_t)vtrn1q_u32((uint32x4_t)v4, (uint32x4_t)v6); + v1 = (uint16x8_t)vtrn1q_u32((uint32x4_t)v5, (uint32x4_t)v7); + v2 = (uint16x8_t)vtrn2q_u32((uint32x4_t)v4, (uint32x4_t)v6); + v3 = (uint16x8_t)vtrn2q_u32((uint32x4_t)v5, (uint32x4_t)v7); + + vst1_u16(b_offset, vget_low_u16(v0)); + vst1_u16(b_offset + 4, vget_low_u16(v1)); + vst1_u16(b_offset + 8, vget_low_u16(v2)); + vst1_u16(b_offset + 12, vget_low_u16(v3)); + vst1_u16(b_offset + 16, vget_high_u16(v0)); + vst1_u16(b_offset + 20, vget_high_u16(v1)); + vst1_u16(b_offset + 24, vget_high_u16(v2)); + vst1_u16(b_offset + 28, vget_high_u16(v3)); + + b_offset += 32; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 8; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; + } + b_offset += 32; + } + } + + if (n & 4) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 4; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0_h = vld1_u16(a_offset0); + v1_h = vld1_u16(a_offset1); + v2_h = vld1_u16(a_offset2); + v3_h = vld1_u16(a_offset3); + + v4_h = vtrn1_u16(v0_h, v1_h); + v5_h = vtrn2_u16(v0_h, v1_h); + v6_h = vtrn1_u16(v2_h, v3_h); + v7_h = vtrn2_u16(v2_h, v3_h); + + v0_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h); + v1_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h); + v2_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h); + v3_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h); + + vst1_u16(b_offset, v0_h); + vst1_u16(b_offset + 4, v1_h); + vst1_u16(b_offset + 8, v2_h); + vst1_u16(b_offset + 12, v3_h); + + b_offset += 16; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 4; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 
0 : a_offset3[line]; + } + b_offset += 16; + } + } + + if (n & 2) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 2; + + for (BLASLONG i = 0; i < m / 4; i++) { + for (BLASLONG line = 0; line < 2; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = a_offset1[line]; + b_offset[line * 4 + 2] = a_offset2[line]; + b_offset[line * 4 + 3] = a_offset3[line]; + } + b_offset += 8; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 2; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; + } + b_offset += 8; + } + } + + if (n & 1) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + b_offset[0] = *a_offset0; + b_offset[1] = *a_offset1; + b_offset[2] = *a_offset2; + b_offset[3] = *a_offset3; + b_offset += 4; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + b_offset[0] = *a_offset0; + b_offset[1] = rest == 1 ? 0 : *a_offset1; + b_offset[2] = rest <= 2 ? 0 : *a_offset2; + b_offset[3] = rest <= 3 ? 0 : *a_offset3; + } + } + return 0; +} + diff --git a/param.h b/param.h index 27743c6ef..70b926e96 100644 --- a/param.h +++ b/param.h @@ -3553,6 +3553,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define SWITCH_RATIO 16 #define GEMM_PREFERED_SIZE 8 #endif +#undef SBGEMM_ALIGN_K +#define SBGEMM_ALIGN_K 4 + +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_UNROLL_N +#define SBGEMM_DEFAULT_UNROLL_M 8 +#define SBGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 From c748e6a33871f0dfa3bf6569c88a676c9a387411 Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Mon, 2 Dec 2024 17:03:10 +0000 Subject: [PATCH 036/205] optimized sbgemm kernel for neoverse-v1 (sve-256) Signed-off-by: Ye Tao --- CONTRIBUTORS.md | 2 + Makefile.arm64 | 4 +- cmake/system.cmake | 4 +- kernel/arm64/KERNEL.NEOVERSEV1 | 12 +- ...rsev1.c => sbgemm_kernel_4x4_neoversev1.c} | 6 +- .../arm64/sbgemm_kernel_4x4_neoversev1_impl.c | 414 +++++++++++++++ .../arm64/sbgemm_kernel_8x4_neoversev1_impl.c | 472 ------------------ kernel/arm64/sbgemm_ncopy_4_neoversev1.c | 129 +++-- kernel/arm64/sbgemm_ncopy_8_neoversev1.c | 180 ------- kernel/arm64/sbgemm_tcopy_4_neoversev1.c | 347 ++++++++++--- kernel/arm64/sbgemm_tcopy_8_neoversev1.c | 200 -------- param.h | 6 +- 12 files changed, 789 insertions(+), 987 deletions(-) rename kernel/arm64/{sbgemm_kernel_8x4_neoversev1.c => sbgemm_kernel_4x4_neoversev1.c} (93%) create mode 100644 kernel/arm64/sbgemm_kernel_4x4_neoversev1_impl.c delete mode 100644 kernel/arm64/sbgemm_kernel_8x4_neoversev1_impl.c delete mode 100644 kernel/arm64/sbgemm_ncopy_8_neoversev1.c delete mode 100644 kernel/arm64/sbgemm_tcopy_8_neoversev1.c diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index d7e75bb97..f4a93aa1b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -240,3 +240,5 @@ In chronological order: * Marek Michalowski * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` +* Ye Tao + * [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 
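For readers unfamiliar with the BF16 matrix-multiply extension that the rewritten 4x4 kernel in this patch is built on: svbfmmla accumulates, within every 128-bit vector segment, a 2x2 float32 tile from two bf16 operands that are each laid out as a 2x4 row-major block (conceptually C += A * transpose(B)). The scalar sketch below models that per-segment computation in plain C, ignoring the instruction's simplified intermediate rounding and denormal handling; it is an illustration added for clarity, not part of the patch, and the helper names (bf16_to_f32, f32_to_bf16, bfmmla_segment_ref) are invented for the example. This 2x2-tile-per-segment result layout is also why the kernel diff further down needs the svuzp1/svuzp2 and ZIP_EVEN/ODD shuffles before rows of C can be stored contiguously, and why pad_k is rounded up to a multiple of 8 (each loop iteration feeds two 4-deep bf16 slices per operand).

#include <stdint.h>
#include <stdio.h>

/* Widen a bf16 bit pattern to float by placing it in the upper 16 bits of an f32. */
static float bf16_to_f32(uint16_t h) {
  union { uint32_t u; float f; } v;
  v.u = (uint32_t)h << 16;
  return v.f;
}

/* Truncate a float to a bf16 bit pattern (round toward zero, sufficient for this demo). */
static uint16_t f32_to_bf16(float f) {
  union { uint32_t u; float f; } v;
  v.f = f;
  return (uint16_t)(v.u >> 16);
}

/* Reference model of one 128-bit BFMMLA segment:
 * c is a 2x2 row-major float32 tile, a and b are 2x4 row-major bf16 blocks,
 * and the operation performed is c += a * transpose(b), accumulating in f32. */
static void bfmmla_segment_ref(float c[4], const uint16_t a[8], const uint16_t b[8]) {
  for (int i = 0; i < 2; i++)
    for (int j = 0; j < 2; j++)
      for (int k = 0; k < 4; k++)
        c[2 * i + j] += bf16_to_f32(a[4 * i + k]) * bf16_to_f32(b[4 * j + k]);
}

int main(void) {
  /* a = [[1 2 3 4],[5 6 7 8]], b = [[1 1 1 1],[2 2 2 2]], all exactly representable in bf16. */
  uint16_t a[8], b[8];
  for (int i = 0; i < 8; i++) {
    a[i] = f32_to_bf16((float)(i + 1));
    b[i] = f32_to_bf16(i < 4 ? 1.0f : 2.0f);
  }
  float c[4] = {0.0f, 0.0f, 0.0f, 0.0f};
  bfmmla_segment_ref(c, a, b);
  /* Expected 2x2 tile: [[10 20],[26 52]]. */
  printf("%g %g\n%g %g\n", c[0], c[1], c[2], c[3]);
  return 0;
}

A 256-bit SVE register holds two such 128-bit segments, so each svbfmmla in the kernel produces two interleaved 2x2 tiles; the ACCUMULATE_LAST4_TO_FIRST4 and unzip steps in the new implementation collapse and reorder them into the row-major C panel.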
diff --git a/Makefile.arm64 b/Makefile.arm64 index 2909a83e0..bea905f58 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -101,7 +101,7 @@ ifeq ($(CORE), NEOVERSEV1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG))) -CCOMMON_OPT += -march=armv8.4-a+sve +CCOMMON_OPT += -march=armv8.4-a+sve+bf16 ifeq (1, $(ISCLANG)) CCOMMON_OPT += -mtune=cortex-x1 else @@ -111,7 +111,7 @@ ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 endif else -CCOMMON_OPT += -march=armv8.4-a+sve +CCOMMON_OPT += -march=armv8.4-a+sve+bf16 ifneq ($(CROSS), 1) CCOMMON_OPT += -mtune=native endif diff --git a/cmake/system.cmake b/cmake/system.cmake index 1bcd7eef9..9c437fc99 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -291,10 +291,10 @@ if (DEFINED TARGET) if (${TARGET} STREQUAL NEOVERSEV1) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") else () if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.4 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.4) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve+bf16 -mtune=neoverse-v1") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1") else () message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_VERSION} does not support Neoverse V1.") endif() diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index 7a7de3c7a..8845e6860 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -1,14 +1,18 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE -SGEMVTKERNEL = gemv_t_sve.c -DGEMVTKERNEL = gemv_t_sve.c +SGEMVTKERNEL = gemv_t_sve_v1x3.c +DGEMVTKERNEL = gemv_t_sve_v1x3.c +ifeq ($(BUILD_BFLOAT16), 1) SBGEMM_BETA = sbgemm_beta_neoversev1.c SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversev1.c +ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_M)_neoversev1.c SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversev1.c -SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c -SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c +SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif \ No newline at end of file diff --git a/kernel/arm64/sbgemm_kernel_8x4_neoversev1.c b/kernel/arm64/sbgemm_kernel_4x4_neoversev1.c similarity index 93% rename from kernel/arm64/sbgemm_kernel_8x4_neoversev1.c rename to kernel/arm64/sbgemm_kernel_4x4_neoversev1.c index d866fb335..889b5fc5b 100644 --- a/kernel/arm64/sbgemm_kernel_8x4_neoversev1.c +++ b/kernel/arm64/sbgemm_kernel_4x4_neoversev1.c @@ -1,5 +1,5 @@ /*************************************************************************** - * Copyright (c) 2024, The OpenBLAS Project + * Copyright (c) 2024-2025, The OpenBLAS Project * All rights reserved. 
* Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are @@ -31,9 +31,9 @@ #include "common.h" #define ALPHA_ONE -#include "sbgemm_kernel_8x4_neoversev1_impl.c" +#include "sbgemm_kernel_4x4_neoversev1_impl.c" #undef ALPHA_ONE -#include "sbgemm_kernel_8x4_neoversev1_impl.c" +#include "sbgemm_kernel_4x4_neoversev1_impl.c" int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) { diff --git a/kernel/arm64/sbgemm_kernel_4x4_neoversev1_impl.c b/kernel/arm64/sbgemm_kernel_4x4_neoversev1_impl.c new file mode 100644 index 000000000..b6d9e9816 --- /dev/null +++ b/kernel/arm64/sbgemm_kernel_4x4_neoversev1_impl.c @@ -0,0 +1,414 @@ +/*************************************************************************** + * Copyright (c) 2024-2025, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * *****************************************************************************/ + +#include + +#include "common.h" + +#define INIT_C(M, N) mc##M##N = svdup_f32(0); + +#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); + +#define INIT_C_4x4 \ + do { \ + INIT_C(0, 0); \ + INIT_C(0, 1); \ + INIT_C(1, 0); \ + INIT_C(1, 1); \ + } while (0); + +#ifdef ALPHA_ONE +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + DST = svld1_f32((PG), (PTR)); \ + DST = svadd_z((PG), SRC, DST); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); +#else +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + DST = svld1_f32((PG), (PTR)); \ + DST = svmad_z((PG), svalpha, SRC, DST); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); +#endif + +#define ZIP_EVEN_ELEMENTS(PG, mc0, mc1, tmp, vc) \ + do { \ + (tmp) = svuzp1_f32((mc0), (mc1)); \ + (vc) = svcompact_f32((PG), (tmp)); \ + } while (0) + +#define ZIP_ODD_ELEMENTS(PG, mc0, mc1, tmp, vc) \ + do { \ + (tmp) = svuzp2_f32((mc0), (mc1)); \ + (vc) = svcompact_f32((PG), (tmp)); \ + } while (0) + +#define ACCUMULATE_LAST4_TO_FIRST4(M, N, TMP) \ + do { \ + TMP = svext_f32(mc##M##N, mc##M##N, 4); \ + mc##M##N = svadd_f32_z(svptrue_b32(), mc##M##N, (TMP)); \ + } while (0) + +#ifdef ALPHA_ONE +int sbgemm_kernel_neoversev1_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, + FLOAT alpha, IFLOAT *A, IFLOAT *B, + FLOAT *C, BLASLONG ldc) +#else +int sbgemm_kernel_neoversev1_alpha(BLASLONG m, BLASLONG n, BLASLONG k, + FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, + BLASLONG ldc) +#endif +{ + + BLASLONG pad_k = (k + 7) & ~7; + svbfloat16_t ma0, ma1, mb0, mb1; + svfloat32_t mc00, mc01, mc10, mc11, vc0, vc1, vc2, vc3, oc0, oc1, oc2, oc3; + svfloat32_t tmp; + svfloat32_t svalpha = svdup_f32(alpha); + + svbool_t pg16_all = svptrue_b16(); + + svbool_t pg32_first_1 = svwhilelt_b32(0, 1); + svbool_t pg32_first_2 = svwhilelt_b32(0, 2); + svbool_t pg32_first_4 = svwhilelt_b32(0, 4); + + svbool_t pg32_select_first_2_per_quadword = svdupq_b32(1, 1, 0, 0); + + bfloat16_t *ptr_a = (bfloat16_t *)A; + bfloat16_t *ptr_b = (bfloat16_t *)B; + FLOAT *ptr_c = C; + + bfloat16_t *ptr_a0; + bfloat16_t *ptr_b0; + FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; + + for (BLASLONG j = 0; j < n / 4; j++) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c2 = ptr_c1 + ldc; + ptr_c3 = ptr_c2 + ldc; + ptr_c += 4 * ldc; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 4; i++) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + + ptr_b0 = ptr_b; + + INIT_C_4x4; + + for (BLASLONG p = 0; p < pad_k; p += 8) { + ma0 = svld1_bf16(pg16_all, ptr_a0); + ma1 = svld1_bf16(pg16_all, ptr_a0 + 16); + + mb0 = svld1_bf16(pg16_all, ptr_b0); + mb1 = svld1_bf16(pg16_all, ptr_b0 + 16); + + MATMUL(0, 0); + MATMUL(0, 1); + MATMUL(1, 0); + MATMUL(1, 1); + + ptr_a0 += 32; + ptr_b0 += 32; + } + + ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); + ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp); + ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp); + ACCUMULATE_LAST4_TO_FIRST4(1, 1, tmp); + + ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0); + ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc1); + + ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc01, mc11, tmp, vc2); + ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc01, mc11, tmp, vc3); + + UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0); + UPDATE_C(pg32_first_4, ptr_c1, oc1, vc1); + UPDATE_C(pg32_first_4, ptr_c2, oc2, vc2) + UPDATE_C(pg32_first_4, ptr_c3, oc3, vc3) + + ptr_c0 += 4; + ptr_c1 += 4; + ptr_c2 += 4; + ptr_c3 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + 
ptr_a += 2 * pad_k; + + ptr_b0 = ptr_b; + INIT_C(0, 0); + INIT_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 8) { + ma0 = svld1_bf16(pg16_all, ptr_a0); + mb0 = svld1_bf16(pg16_all, ptr_b0); + mb1 = svld1_bf16(pg16_all, ptr_b0 + 16); + + MATMUL(0, 0); + MATMUL(0, 1); + + ptr_a0 += 16; + ptr_b0 += 32; + } + + ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); + ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp); + + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + vc2 = svuzp1(mc01, mc01); + vc3 = svuzp2(mc01, mc01); + + UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0); + UPDATE_C(pg32_first_2, ptr_c1, oc1, vc1); + UPDATE_C(pg32_first_2, ptr_c2, oc2, vc2); + UPDATE_C(pg32_first_2, ptr_c3, oc3, vc3); + + ptr_c0 += 2; + ptr_c1 += 2; + ptr_c2 += 2; + ptr_c3 += 2; + } + + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + + INIT_C(0, 0); + INIT_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 8) { + ma0 = svld1_bf16(pg16_all, ptr_a0); + mb0 = svld1_bf16(pg16_all, ptr_b0); + mb1 = svld1_bf16(pg16_all, ptr_b0 + 16); + + MATMUL(0, 0); + MATMUL(0, 1); + + ptr_a0 += 16; + ptr_b0 += 32; + } + + ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); + ACCUMULATE_LAST4_TO_FIRST4(0, 1, tmp); + + // use compact is more straightforward + vc1 = svuzp2(mc00, mc00); + vc3 = svuzp2(mc01, mc01); + + UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00); + UPDATE_C(pg32_first_1, ptr_c1, oc1, vc1); + UPDATE_C(pg32_first_1, ptr_c2, oc2, mc01); + UPDATE_C(pg32_first_1, ptr_c3, oc3, vc3); + } + + ptr_b += 4 * pad_k; + } + + if (n & 2) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c += 2 * ldc; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 4; i++) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + + ptr_b0 = ptr_b; + + INIT_C(0, 0); + INIT_C(1, 0); + + for (BLASLONG p = 0; p < pad_k; p += 8) { + ma0 = svld1_bf16(pg16_all, ptr_a0); + ma1 = svld1_bf16(pg16_all, ptr_a0 + 16); + + mb0 = svld1_bf16(pg16_all, ptr_b0); + + MATMUL(0, 0); + MATMUL(1, 0); + + ptr_a0 += 32; + ptr_b0 += 16; + } + + ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); + ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp); + + ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0); + ZIP_ODD_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc2); + + UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0); + UPDATE_C(pg32_first_4, ptr_c1, oc2, vc2); + + ptr_c0 += 4; + ptr_c1 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 8) { + ma0 = svld1_bf16(pg16_all, ptr_a0); + mb0 = svld1_bf16(pg16_all, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 16; + ptr_b0 += 16; + } + + ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + + UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0); + UPDATE_C(pg32_first_2, ptr_c1, oc1, vc1); + + ptr_c0 += 2; + ptr_c1 += 2; + } + + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + INIT_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 8) { + ma0 = svld1_bf16(pg16_all, ptr_a0); + mb0 = svld1_bf16(pg16_all, ptr_b0); + MATMUL(0, 0); + ptr_a0 += 16; + ptr_b0 += 16; + } + + ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); + vc1 = svuzp2(mc00, mc00); + + UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00); + UPDATE_C(pg32_first_1, ptr_c1, oc1, vc1); + } + + ptr_b += 2 * pad_k; + } + + if (n & 1) { // TODO: this case seems a overhead. find out whether it's in our + // case. 
+ ptr_c0 = ptr_c; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 4; i++) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + + ptr_b0 = ptr_b; + + INIT_C(0, 0); + INIT_C(1, 0); + + for (BLASLONG p = 0; p < pad_k; p += 8) { + ma0 = svld1_bf16(pg16_all, ptr_a0); + ma1 = svld1_bf16(pg16_all, ptr_a0 + 16); + + mb0 = svld1_bf16(pg16_all, ptr_b0); + + MATMUL(0, 0); + MATMUL(1, 0); + + ptr_a0 += 32; + ptr_b0 += 16; + } + + ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); + ACCUMULATE_LAST4_TO_FIRST4(1, 0, tmp); + + ZIP_EVEN_ELEMENTS(pg32_select_first_2_per_quadword, mc00, mc10, tmp, vc0); + + UPDATE_C(pg32_first_4, ptr_c0, oc0, vc0); + + ptr_c0 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 8) { + ma0 = svld1_bf16(pg16_all, ptr_a0); + mb0 = svld1_bf16(pg16_all, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 16; + ptr_b0 += 16; + } + + ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); + + vc0 = svuzp1(mc00, mc00); + + UPDATE_C(pg32_first_2, ptr_c0, oc0, vc0); + + ptr_c0 += 2; + } + + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + + INIT_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 8) { + + ma0 = svld1_bf16(pg16_all, ptr_a0); + mb0 = svld1_bf16(pg16_all, ptr_b0); + + MATMUL(0, 0); + ptr_a0 += 16; + ptr_b0 += 16; + } + + ACCUMULATE_LAST4_TO_FIRST4(0, 0, tmp); + + UPDATE_C(pg32_first_1, ptr_c0, oc0, mc00); + } + } + + return 0; +} diff --git a/kernel/arm64/sbgemm_kernel_8x4_neoversev1_impl.c b/kernel/arm64/sbgemm_kernel_8x4_neoversev1_impl.c deleted file mode 100644 index 86daa117e..000000000 --- a/kernel/arm64/sbgemm_kernel_8x4_neoversev1_impl.c +++ /dev/null @@ -1,472 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2024, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * *****************************************************************************/ - -#include - -#include "common.h" - -#define INIT_C(M, N) mc##M##N = svdup_f32(0); - -#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); - -#define INIT_C_8x4 \ - do { \ - INIT_C(0, 0); \ - INIT_C(0, 1); \ - INIT_C(1, 0); \ - INIT_C(1, 1); \ - INIT_C(2, 0); \ - INIT_C(2, 1); \ - INIT_C(3, 0); \ - INIT_C(3, 1); \ - } while (0); - -#ifdef ALPHA_ONE -#define UPDATE_C(PG, PTR, DST, SRC) \ - do { \ - DST = svld1_f32((PG), (PTR)); \ - DST = svadd_z((PG), SRC, DST); \ - svst1_f32((PG), (PTR), DST); \ - } while (0); -#else -#define UPDATE_C(PG, PTR, DST, SRC) \ - do { \ - DST = svld1_f32((PG), (PTR)); \ - DST = svmad_z((PG), svalpha, SRC, DST); \ - svst1_f32((PG), (PTR), DST); \ - } while (0); -#endif - -#ifdef ALPHA_ONE -int sbgemm_kernel_neoversev1_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) -#else -int sbgemm_kernel_neoversev1_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) -#endif -{ - BLASLONG pad_k = (k + 3) & ~3; - - svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; - svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31, - vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7, - oc0, oc1, oc2, oc3, oc4, oc5, oc6, oc7; - svfloat32_t svalpha = svdup_f32(alpha); - - svbool_t pg16 = svptrue_b16(); - svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); - svbool_t pg32 = svptrue_b32(); - svbool_t pg32_low = svdupq_b32(1, 1, 0, 0); - svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); - - bfloat16_t *ptr_a = (bfloat16_t *)A; - bfloat16_t *ptr_b = (bfloat16_t *)B; - FLOAT *ptr_c = C; - - bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; - bfloat16_t *ptr_b0, *ptr_b1; - FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; - - for (BLASLONG j = 0; j < n / 4; j++) { - ptr_c0 = ptr_c; - ptr_c1 = ptr_c0 + ldc; - ptr_c2 = ptr_c1 + ldc; - ptr_c3 = ptr_c2 + ldc; - ptr_c += 4 * ldc; - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a += 8 * pad_k; - - ptr_b0 = ptr_b; - - INIT_C_8x4; - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - ma2 = svld1_bf16(pg16, ptr_a0 + 16); - ma3 = svld1_bf16(pg16, ptr_a0 + 24); - - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); - - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - MATMUL(2, 0); MATMUL(2, 1); - MATMUL(3, 0); MATMUL(3, 1); - - ptr_a0 += 32; - ptr_b0 += 16; - } - - vc0 = svuzp1(mc00, mc10); - vc1 = svuzp1(mc20, mc30); - vc2 = svuzp2(mc00, mc10); - vc3 = svuzp2(mc20, mc30); - vc4 = svuzp1(mc01, mc11); - vc5 = svuzp1(mc21, mc31); - vc6 = svuzp2(mc01, mc11); - vc7 = svuzp2(mc21, mc31); - - UPDATE_C(pg32, ptr_c0, oc0, vc0); - UPDATE_C(pg32, ptr_c0+4, oc1, vc1); - UPDATE_C(pg32, ptr_c1, oc2, vc2); - UPDATE_C(pg32, ptr_c1+4, oc3, vc3); - UPDATE_C(pg32, ptr_c2, oc4, vc4) - UPDATE_C(pg32, ptr_c2+4, oc5, vc5); - UPDATE_C(pg32, ptr_c3, oc6, vc6) - UPDATE_C(pg32, ptr_c3+4, oc7, vc7); - - ptr_c0 += 8; - ptr_c1 += 8; - ptr_c2 += 8; - ptr_c3 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a += 4 * pad_k; - ptr_b0 = ptr_b; - - INIT_C(0, 0); INIT_C(0, 1); - INIT_C(1, 0); INIT_C(1, 1); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); - - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - - ptr_a0 += 16; - 
ptr_b0 += 16; - } - - vc0 = svuzp1(mc00, mc10); - vc1 = svuzp2(mc00, mc10); - vc2 = svuzp1(mc01, mc11); - vc3 = svuzp2(mc01, mc11); - - UPDATE_C(pg32, ptr_c0, oc0, vc0); - UPDATE_C(pg32, ptr_c1, oc1, vc1); - UPDATE_C(pg32, ptr_c2, oc2, vc2); - UPDATE_C(pg32, ptr_c3, oc3, vc3); - - ptr_c0 += 4; - ptr_c1 += 4; - ptr_c2 += 4; - ptr_c3 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * pad_k; - ptr_b0 = ptr_b; - - INIT_C(0, 0); INIT_C(0, 1); - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); - - MATMUL(0, 0); MATMUL(0, 1); - - ptr_a0 += 8; - ptr_b0 += 16; - } - - vc0 = svuzp1(mc00, mc00); - vc1 = svuzp2(mc00, mc00); - vc2 = svuzp1(mc01, mc01); - vc3 = svuzp2(mc01, mc01); - - UPDATE_C(pg32_low, ptr_c0, oc0, vc0); - UPDATE_C(pg32_low, ptr_c1, oc1, vc1); - UPDATE_C(pg32_low, ptr_c2, oc2, vc2); - UPDATE_C(pg32_low, ptr_c3, oc3, vc3); - - ptr_c0 += 2; - ptr_c1 += 2; - ptr_c2 += 2; - ptr_c3 += 2; - } - - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - - INIT_C(0, 0); INIT_C(0, 1); - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); - - MATMUL(0, 0); MATMUL(0, 1); - - ptr_a0 += 4; - ptr_b0 += 16; - } - - vc1 = svuzp2(mc00, mc00); - vc3 = svuzp2(mc01, mc01); - - UPDATE_C(pg32_first, ptr_c0, oc0, mc00); - UPDATE_C(pg32_first, ptr_c1, oc1, vc1); - UPDATE_C(pg32_first, ptr_c2, oc2, mc01); - UPDATE_C(pg32_first, ptr_c3, oc3, vc3); - - } - - ptr_b += 4 * pad_k; - } - - if (n & 2) { - ptr_c0 = ptr_c; - ptr_c1 = ptr_c0 + ldc; - ptr_c += 2 * ldc; - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a += 8 * pad_k; - - ptr_b0 = ptr_b; - - INIT_C(0, 0); - INIT_C(1, 0); - INIT_C(2, 0); - INIT_C(3, 0); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - ma2 = svld1_bf16(pg16, ptr_a0 + 16); - ma3 = svld1_bf16(pg16, ptr_a0 + 24); - - mb0 = svld1_bf16(pg16, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - - ptr_a0 += 32; - ptr_b0 += 8; - } - - vc0 = svuzp1(mc00, mc10); - vc1 = svuzp1(mc20, mc30); - vc2 = svuzp2(mc00, mc10); - vc3 = svuzp2(mc20, mc30); - - UPDATE_C(pg32, ptr_c0, oc0, vc0); - UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1); - UPDATE_C(pg32, ptr_c1, oc2, vc2); - UPDATE_C(pg32, ptr_c1 + 4, oc3, vc3); - - ptr_c0 += 8; - ptr_c1 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a += 4 * pad_k; - ptr_b0 = ptr_b; - - INIT_C(0, 0); - INIT_C(1, 0); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - mb0 = svld1_bf16(pg16, ptr_b0); - MATMUL(0, 0); - MATMUL(1, 0); - ptr_a0 += 16; - ptr_b0 += 8; - } - - vc0 = svuzp1(mc00, mc10); - vc1 = svuzp2(mc00, mc10); - - UPDATE_C(pg32, ptr_c0, oc0, vc0); - UPDATE_C(pg32, ptr_c1, oc1, vc1); - - ptr_c0 += 4; - ptr_c1 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * pad_k; - ptr_b0 = ptr_b; - - INIT_C(0, 0); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 8; - ptr_b0 += 8; - } - - vc0 = svuzp1(mc00, mc00); - vc1 = svuzp2(mc00, mc00); - UPDATE_C(pg32_low, ptr_c0, oc0, vc0); - UPDATE_C(pg32_low, ptr_c1, oc1, vc1); - - ptr_c0 += 2; - ptr_c1 += 2; - - } - - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - INIT_C(0, 0); - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = 
svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); - MATMUL(0, 0); - ptr_a0 += 4; - ptr_b0 += 8; - } - vc1 = svuzp2(mc00, mc00); - - UPDATE_C(pg32_first, ptr_c0, oc0, mc00); - UPDATE_C(pg32_first, ptr_c1, oc1, vc1); - } - - ptr_b += 2 * pad_k; - } - - if (n & 1) { - ptr_c0 = ptr_c; - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a += 8 * pad_k; - - ptr_b0 = ptr_b; - - INIT_C(0, 0); - INIT_C(1, 0); - INIT_C(2, 0); - INIT_C(3, 0); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - ma2 = svld1_bf16(pg16, ptr_a0 + 16); - ma3 = svld1_bf16(pg16, ptr_a0 + 24); - - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - - ptr_a0 += 32; - ptr_b0 += 4; - } - - vc0 = svuzp1(mc00, mc10); - vc1 = svuzp1(mc20, mc30); - - UPDATE_C(pg32, ptr_c0, oc0, vc0); - UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1); - - ptr_c0 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a += 4 * pad_k; - ptr_b0 = ptr_b; - INIT_C(0, 0); - INIT_C(1, 0); - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - mb0 = svld1_bf16(pg16_low, ptr_b0); - MATMUL(0, 0); - MATMUL(1, 0); - ptr_a0 += 16; - ptr_b0 += 4; - } - vc0 = svuzp1(mc00, mc10); - UPDATE_C(pg32, ptr_c0, oc0, vc0); - ptr_c0 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * pad_k; - ptr_b0 = ptr_b; - - INIT_C(0, 0); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 8; - ptr_b0 += 4; - } - vc0 = svuzp1(mc00, mc00); - UPDATE_C(pg32_low, ptr_c0, oc0, vc0); - ptr_c0 += 2; - } - - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - INIT_C(0, 0); - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16_low, ptr_b0); - MATMUL(0, 0); - ptr_a0 += 4; - ptr_b0 += 4; - } - UPDATE_C(pg32_first, ptr_c0, oc0, mc00); - } - } - - return 0; -} - diff --git a/kernel/arm64/sbgemm_ncopy_4_neoversev1.c b/kernel/arm64/sbgemm_ncopy_4_neoversev1.c index 59d0dc58c..100f5c68e 100644 --- a/kernel/arm64/sbgemm_ncopy_4_neoversev1.c +++ b/kernel/arm64/sbgemm_ncopy_4_neoversev1.c @@ -1,5 +1,5 @@ /*************************************************************************** - * Copyright (c) 2024, The OpenBLAS Project + * Copyright (c) 2024-2025, The OpenBLAS Project * All rights reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are @@ -37,8 +37,17 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { a_offset = a; b_offset = b; - svbool_t pg16 = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); + bfloat16_t zero_value_bf16; + *((uint16_t *)(&zero_value_bf16)) = 0; + + svbool_t pg16_all = svptrue_b16(); // 16 elements for sve-256 machine. 
+ svbool_t pg16_first_8 = svwhilelt_b16(0, 8); + svbfloat16_t v0, v1, v2, v3; + svuint64_t t0, t1; + + BLASLONG rest = m & 7; + svbool_t pg16_rest = svwhilelt_b16_s32(0, rest); for (BLASLONG j = 0; j < n / 4; j++) { a_offsetx[0] = a_offset; @@ -47,33 +56,41 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { a_offsetx[3] = a_offsetx[2] + lda; a_offset += 4 * lda; - for (BLASLONG i = 0; i < m / 4; i++) { - v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); - v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); - v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]); - v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]); + for (BLASLONG i = 0; i < m / 8; i++) { + v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[1]); + v2 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[2]); + v3 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[3]); - svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); - svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); - svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2); - svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3); + t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); + t1 = svzip1_u64(svreinterpret_u64_bf16(v2), svreinterpret_u64_bf16(v3)); - b_offset += 16; - a_offsetx[0] += 4; - a_offsetx[1] += 4; - a_offsetx[2] += 4; - a_offsetx[3] += 4; + svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); + svst1_bf16(pg16_all, (bfloat16_t *)b_offset + 16, + svreinterpret_bf16_u64(t1)); + + a_offsetx[0] += 8; + a_offsetx[1] += 8; + a_offsetx[2] += 8; + a_offsetx[3] += 8; + + b_offset += 32; } - if (m & 3) { - BLASLONG rest = m & 3; - for (BLASLONG col = 0; col < 4; col++) { - b_offset[4 * col] = a_offsetx[col][0]; - b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; - b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; - b_offset[4 * col + 3] = rest <= 3 ? 
0 : a_offsetx[col][3]; - } - b_offset += 16; + if (rest) { // remainder along k dim + v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[1]); + v2 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[2]); + v3 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[3]); + + t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); + t1 = svzip1_u64(svreinterpret_u64_bf16(v2), svreinterpret_u64_bf16(v3)); + + svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); + svst1_bf16(pg16_all, (bfloat16_t *)b_offset + 16, + svreinterpret_bf16_u64(t1)); + + b_offset += 32; } } @@ -82,46 +99,50 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { a_offsetx[1] = a_offsetx[0] + lda; a_offset += 2 * lda; - for (BLASLONG i = 0; i < m / 4; i++) { - v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); - v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); - svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); - svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + for (BLASLONG i = 0; i < m / 8; i++) { + v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[1]); + + t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); + svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); - b_offset += 8; - a_offsetx[0] += 4; - a_offsetx[1] += 4; + b_offset += 16; + a_offsetx[0] += 8; + a_offsetx[1] += 8; } - if (m & 3) { - BLASLONG rest = m & 3; - for (BLASLONG col = 0; col < 2; col++) { - b_offset[4 * col] = a_offsetx[col][0]; - b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; - b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; - b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; - } - b_offset += 8; + if (rest) { // remainder along k dim + v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[1]); + + t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); + svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); + + b_offset += 16; } } if (n & 1) { a_offsetx[0] = a_offset; - for (BLASLONG i = 0; i < m / 4; i++) { - v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); - svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); - b_offset += 4; - a_offsetx[0] += 4; + + for (BLASLONG i = 0; i < m / 8; i++) { + v0 = svld1_bf16(pg16_first_8, (bfloat16_t *)a_offsetx[0]); + v1 = svdup_bf16(zero_value_bf16); + + t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); + svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); + + b_offset += 16; + a_offsetx[0] += 8; } - if (m & 3) { - BLASLONG rest = m & 3; - b_offset[0] = a_offsetx[0][0]; - b_offset[1] = rest == 1 ? 0 : a_offsetx[0][1]; - b_offset[2] = rest <= 2 ? 0 : a_offsetx[0][2]; - b_offset[3] = rest <= 3 ? 
0 : a_offsetx[0][3]; + + if (rest) { // remainder along k dim + v0 = svld1_bf16(pg16_rest, (bfloat16_t *)a_offsetx[0]); + v1 = svdup_bf16(zero_value_bf16); + t0 = svzip1_u64(svreinterpret_u64_bf16(v0), svreinterpret_u64_bf16(v1)); + svst1_bf16(pg16_all, (bfloat16_t *)b_offset, svreinterpret_bf16_u64(t0)); } } return 0; } - diff --git a/kernel/arm64/sbgemm_ncopy_8_neoversev1.c b/kernel/arm64/sbgemm_ncopy_8_neoversev1.c deleted file mode 100644 index 34412f05f..000000000 --- a/kernel/arm64/sbgemm_ncopy_8_neoversev1.c +++ /dev/null @@ -1,180 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2024, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * *****************************************************************************/ - -#include - -#include "common.h" - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { - IFLOAT *a_offset; - IFLOAT *a_offsetx[8]; - IFLOAT *b_offset; - a_offset = a; - b_offset = b; - - svbool_t pg16 = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); - svbfloat16_t v0, v1, v2, v3, v4, v5, v6, v7; - - for (BLASLONG j = 0; j < n / 8; j++) { - a_offsetx[0] = a_offset; - a_offsetx[1] = a_offsetx[0] + lda; - a_offsetx[2] = a_offsetx[1] + lda; - a_offsetx[3] = a_offsetx[2] + lda; - a_offsetx[4] = a_offsetx[3] + lda; - a_offsetx[5] = a_offsetx[4] + lda; - a_offsetx[6] = a_offsetx[5] + lda; - a_offsetx[7] = a_offsetx[6] + lda; - a_offset += 8 * lda; - - for (BLASLONG i = 0; i < m / 4; i++) { - v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); - v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); - v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]); - v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]); - v4 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[4]); - v5 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[5]); - v6 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[6]); - v7 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[7]); - - svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); - svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); - svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2); - svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3); - svst1_bf16(pg16, (bfloat16_t *)b_offset + 16, v4); - svst1_bf16(pg16, (bfloat16_t *)b_offset + 20, v5); - svst1_bf16(pg16, (bfloat16_t *)b_offset + 24, v6); - svst1_bf16(pg16, (bfloat16_t *)b_offset + 28, v7); - - b_offset += 32; - a_offsetx[0] += 4; - a_offsetx[1] += 4; - a_offsetx[2] += 4; - a_offsetx[3] += 4; - a_offsetx[4] += 4; - a_offsetx[5] += 4; - a_offsetx[6] += 4; - a_offsetx[7] += 4; - } - - if (m & 3) { - BLASLONG rest = m & 3; - for (BLASLONG col = 0; col < 8; col++) { - b_offset[4 * col] = a_offsetx[col][0]; - b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; - b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; - b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; - } - b_offset += 32; - } - } - - if (n & 4) { - a_offsetx[0] = a_offset; - a_offsetx[1] = a_offsetx[0] + lda; - a_offsetx[2] = a_offsetx[1] + lda; - a_offsetx[3] = a_offsetx[2] + lda; - a_offset += 4 * lda; - - for (BLASLONG i = 0; i < m / 4; i++) { - v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); - v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); - v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]); - v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]); - - svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); - svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); - svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2); - svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3); - - b_offset += 16; - a_offsetx[0] += 4; - a_offsetx[1] += 4; - a_offsetx[2] += 4; - a_offsetx[3] += 4; - } - - if (m & 3) { - BLASLONG rest = m & 3; - for (BLASLONG col = 0; col < 4; col++) { - b_offset[4 * col] = a_offsetx[col][0]; - b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; - b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; - b_offset[4 * col + 3] = rest <= 3 ? 
0 : a_offsetx[col][3]; - } - b_offset += 16; - } - } - - if (n & 2) { - a_offsetx[0] = a_offset; - a_offsetx[1] = a_offsetx[0] + lda; - a_offset += 2 * lda; - - for (BLASLONG i = 0; i < m / 4; i++) { - v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); - v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); - svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); - svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); - - b_offset += 8; - a_offsetx[0] += 4; - a_offsetx[1] += 4; - } - - if (m & 3) { - BLASLONG rest = m & 3; - for (BLASLONG col = 0; col < 2; col++) { - b_offset[4 * col] = a_offsetx[col][0]; - b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; - b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; - b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; - } - b_offset += 8; - } - } - - if (n & 1) { - a_offsetx[0] = a_offset; - for (BLASLONG i = 0; i < m / 4; i++) { - v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); - svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); - b_offset += 4; - a_offsetx[0] += 4; - } - if (m & 3) { - BLASLONG rest = m & 3; - b_offset[0] = a_offsetx[0][0]; - b_offset[1] = rest == 1 ? 0 : a_offsetx[0][1]; - b_offset[2] = rest <= 2 ? 0 : a_offsetx[0][2]; - b_offset[3] = rest <= 3 ? 0 : a_offsetx[0][3]; - } - } - - return 0; -} - diff --git a/kernel/arm64/sbgemm_tcopy_4_neoversev1.c b/kernel/arm64/sbgemm_tcopy_4_neoversev1.c index 5f6241ff8..140e8f7ed 100644 --- a/kernel/arm64/sbgemm_tcopy_4_neoversev1.c +++ b/kernel/arm64/sbgemm_tcopy_4_neoversev1.c @@ -1,5 +1,5 @@ /*************************************************************************** - * Copyright (c) 2024, The OpenBLAS Project + * Copyright (c) 2024-2025, The OpenBLAS Project * All rights reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are @@ -25,62 +25,214 @@ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
* *****************************************************************************/ -#include - #include "common.h" +#include +#include int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { - IFLOAT *a_offset, *a_offset0, *a_offset1, *a_offset2, *a_offset3; + BLASLONG pad_m = ((m + 7) & ~7); + BLASLONG rest = (m & 7); // rest along m dim + + IFLOAT *a_offset; + IFLOAT *a_offset0, *a_offset1, *a_offset2, *a_offset3; + IFLOAT *a_offset4, *a_offset5, *a_offset6, *a_offset7; + IFLOAT *b_offset; + IFLOAT *b_offset0, *b_offset1; + a_offset = a; b_offset = b; - uint16x4_t v0_h, v1_h, v2_h, v3_h, v4_h, v5_h, v6_h, v7_h; + svuint16_t c0, c1, c2, c3, c4, c5, c6, c7; + svuint16_t t0, t1, t2, t3; + svuint32_t m00, m01, m10, m11; + svuint64_t st_offsets_0, st_offsets_1; + + svbool_t pg16_first_4 = svwhilelt_b16(0, 4); + svbool_t pg16_first_8 = svwhilelt_b16(0, 8); + + svbool_t pg64_first_4 = svwhilelt_b64(0, 4); + + u_int32_t sizeof_u64 = 8; + u_int64_t _st_offsets_0[4] = { + 0 * sizeof_u64, + 1 * sizeof_u64, + 4 * sizeof_u64, + 5 * sizeof_u64, + }; + + u_int64_t _st_offsets_1[4] = { + 2 * sizeof_u64, + 3 * sizeof_u64, + 6 * sizeof_u64, + 7 * sizeof_u64, + }; + + st_offsets_0 = svld1_u64(pg64_first_4, _st_offsets_0); + st_offsets_1 = svld1_u64(pg64_first_4, _st_offsets_1); + + for (BLASLONG j = 0; j < n / 8; j++) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; + a_offset += 8; + + b_offset0 = b_offset; + b_offset1 = b_offset0 + 4 * pad_m; + + b_offset += 8 * pad_m; + for (BLASLONG i = 0; i < m / 8; i++) { + // transpose 8x8 matrix and pack into two 4x8 block consists of two 2x4 + // small blocks + c0 = svld1_u16(pg16_first_8, a_offset0); + c1 = svld1_u16(pg16_first_8, a_offset1); + c2 = svld1_u16(pg16_first_8, a_offset2); + c3 = svld1_u16(pg16_first_8, a_offset3); + c4 = svld1_u16(pg16_first_8, a_offset4); + c5 = svld1_u16(pg16_first_8, a_offset5); + c6 = svld1_u16(pg16_first_8, a_offset6); + c7 = svld1_u16(pg16_first_8, a_offset7); + + t0 = svzip1_u16(c0, c1); + t1 = svzip1_u16(c2, c3); + t2 = svzip1_u16(c4, c5); + t3 = svzip1_u16(c6, c7); + + m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); + m10 = svzip2_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); + m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); + m11 = svzip2_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); + + svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, + st_offsets_0, svreinterpret_u64_u32(m00)); + svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, + st_offsets_1, svreinterpret_u64_u32(m01)); + svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, + st_offsets_0, svreinterpret_u64_u32(m10)); + svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, + st_offsets_1, svreinterpret_u64_u32(m11)); + + a_offset0 += 8 * lda; + a_offset1 += 8 * lda; + a_offset2 += 8 * lda; + a_offset3 += 8 * lda; + a_offset4 += 8 * lda; + a_offset5 += 8 * lda; + a_offset6 += 8 * lda; + a_offset7 += 8 * lda; + + b_offset0 += 32; + b_offset1 += 32; + } + + if (rest) { + c0 = svld1_u16(pg16_first_8, a_offset0); + c1 = (rest >= 2 ? svld1_u16(pg16_first_8, a_offset1) : svdup_u16(0)); + c2 = (rest >= 3 ? svld1_u16(pg16_first_8, a_offset2) : svdup_u16(0)); + c3 = (rest >= 4 ? 
svld1_u16(pg16_first_8, a_offset3) : svdup_u16(0)); + c4 = (rest >= 5 ? svld1_u16(pg16_first_8, a_offset4) : svdup_u16(0)); + c5 = (rest >= 6 ? svld1_u16(pg16_first_8, a_offset5) : svdup_u16(0)); + c6 = (rest == 7 ? svld1_u16(pg16_first_8, a_offset6) : svdup_u16(0)); + c7 = (svdup_u16(0)); + + t0 = svzip1_u16(c0, c1); + t1 = svzip1_u16(c2, c3); + t2 = svzip1_u16(c4, c5); + t3 = svzip1_u16(c6, c7); + + m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); + m10 = svzip2_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); + m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); + m11 = svzip2_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); - for (BLASLONG j = 0; j < n / 4; j++) { + svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, + st_offsets_0, svreinterpret_u64_u32(m00)); + svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, + st_offsets_1, svreinterpret_u64_u32(m01)); + svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, + st_offsets_0, svreinterpret_u64_u32(m10)); + svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset1, + st_offsets_1, svreinterpret_u64_u32(m11)); + } + } + + if (n & 4) { a_offset0 = a_offset; a_offset1 = a_offset0 + lda; a_offset2 = a_offset1 + lda; a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; a_offset += 4; - for (BLASLONG i = 0; i < m / 4; i++) { - v0_h = vld1_u16(a_offset0); - v1_h = vld1_u16(a_offset1); - v2_h = vld1_u16(a_offset2); - v3_h = vld1_u16(a_offset3); + b_offset0 = b_offset; + b_offset += 4 * pad_m; - v4_h = vtrn1_u16(v0_h, v1_h); - v5_h = vtrn2_u16(v0_h, v1_h); - v6_h = vtrn1_u16(v2_h, v3_h); - v7_h = vtrn2_u16(v2_h, v3_h); + for (BLASLONG i = 0; i < m / 8; i++) { + // transpose 8x8 matrix and pack into two 4x8 block consists of two 2x4 + // small blocks + c0 = svld1_u16(pg16_first_4, a_offset0); + c1 = svld1_u16(pg16_first_4, a_offset1); + c2 = svld1_u16(pg16_first_4, a_offset2); + c3 = svld1_u16(pg16_first_4, a_offset3); + c4 = svld1_u16(pg16_first_4, a_offset4); + c5 = svld1_u16(pg16_first_4, a_offset5); + c6 = svld1_u16(pg16_first_4, a_offset6); + c7 = svld1_u16(pg16_first_4, a_offset7); - v0_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h); - v1_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h); - v2_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h); - v3_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h); + t0 = svzip1_u16(c0, c1); + t1 = svzip1_u16(c2, c3); + t2 = svzip1_u16(c4, c5); + t3 = svzip1_u16(c6, c7); - vst1_u16(b_offset, v0_h); - vst1_u16(b_offset + 4, v1_h); - vst1_u16(b_offset + 8, v2_h); - vst1_u16(b_offset + 12, v3_h); + m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); + m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); + svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, + st_offsets_0, svreinterpret_u64_u32(m00)); + svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, + st_offsets_1, svreinterpret_u64_u32(m01)); - b_offset += 16; - a_offset0 += 4 * lda; - a_offset1 += 4 * lda; - a_offset2 += 4 * lda; - a_offset3 += 4 * lda; + a_offset0 += 8 * lda; + a_offset1 += 8 * lda; + a_offset2 += 8 * lda; + a_offset3 += 8 * lda; + a_offset4 += 8 * lda; + a_offset5 += 8 * lda; + a_offset6 += 8 * lda; + a_offset7 += 8 * lda; + + b_offset0 += 32; } - if (m & 3) { - BLASLONG rest = m & 3; - for (BLASLONG line = 0; 
line < 4; line++) { - b_offset[line * 4] = a_offset0[line]; - b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; - b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; - b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; - } - b_offset += 16; + if (rest) { + c0 = svld1_u16(pg16_first_4, a_offset0); // rest >= 1 + c1 = (rest >= 2 ? svld1_u16(pg16_first_4, a_offset1) : svdup_u16(0)); + c2 = (rest >= 3 ? svld1_u16(pg16_first_4, a_offset2) : svdup_u16(0)); + c3 = (rest >= 4 ? svld1_u16(pg16_first_4, a_offset3) : svdup_u16(0)); + c4 = (rest >= 5 ? svld1_u16(pg16_first_4, a_offset4) : svdup_u16(0)); + c5 = (rest >= 6 ? svld1_u16(pg16_first_4, a_offset5) : svdup_u16(0)); + c6 = (rest == 7 ? svld1_u16(pg16_first_4, a_offset6) : svdup_u16(0)); + c7 = (svdup_u16(0)); + + t0 = svzip1_u16(c0, c1); + t1 = svzip1_u16(c2, c3); + t2 = svzip1_u16(c4, c5); + t3 = svzip1_u16(c6, c7); + + m00 = svzip1_u32(svreinterpret_u32_u16(t0), svreinterpret_u32_u16(t1)); + m01 = svzip1_u32(svreinterpret_u32_u16(t2), svreinterpret_u32_u16(t3)); + + svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, + st_offsets_0, svreinterpret_u64_u32(m00)); + svst1_scatter_u64offset_u64(pg64_first_4, (u_int64_t *)b_offset0, + st_offsets_1, svreinterpret_u64_u32(m01)); } } @@ -89,31 +241,54 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { a_offset1 = a_offset0 + lda; a_offset2 = a_offset1 + lda; a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; a_offset += 2; - for (BLASLONG i = 0; i < m / 4; i++) { + b_offset0 = b_offset; + b_offset1 = b_offset0 + 8; + + b_offset += 2 * pad_m; + + for (BLASLONG i = 0; i < m / 8; i++) { for (BLASLONG line = 0; line < 2; line++) { - b_offset[line * 4] = a_offset0[line]; - b_offset[line * 4 + 1] = a_offset1[line]; - b_offset[line * 4 + 2] = a_offset2[line]; - b_offset[line * 4 + 3] = a_offset3[line]; + b_offset0[line * 4] = a_offset0[line]; + b_offset0[line * 4 + 1] = a_offset1[line]; + b_offset0[line * 4 + 2] = a_offset2[line]; + b_offset0[line * 4 + 3] = a_offset3[line]; + + b_offset1[line * 4] = a_offset4[line]; + b_offset1[line * 4 + 1] = a_offset5[line]; + b_offset1[line * 4 + 2] = a_offset6[line]; + b_offset1[line * 4 + 3] = a_offset7[line]; } - b_offset += 8; - a_offset0 += 4 * lda; - a_offset1 += 4 * lda; - a_offset2 += 4 * lda; - a_offset3 += 4 * lda; + b_offset0 += 16; + b_offset1 += 16; + + a_offset0 += 8 * lda; + a_offset1 += 8 * lda; + a_offset2 += 8 * lda; + a_offset3 += 8 * lda; + a_offset4 += 8 * lda; + a_offset5 += 8 * lda; + a_offset6 += 8 * lda; + a_offset7 += 8 * lda; } - if (m & 3) { - BLASLONG rest = m & 3; + if (rest) { for (BLASLONG line = 0; line < 2; line++) { - b_offset[line * 4] = a_offset0[line]; - b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; - b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; - b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; + b_offset0[line * 4] = a_offset0[line]; + b_offset0[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset0[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset0[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; + + b_offset1[line * 4] = rest <= 4 ? 0 : a_offset4[line]; + b_offset1[line * 4 + 1] = rest <= 5 ? 0 : a_offset5[line]; + b_offset1[line * 4 + 2] = rest <= 6 ? 
0 : a_offset6[line]; + b_offset1[line * 4 + 3] = 0; } - b_offset += 8; } } @@ -122,27 +297,65 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { a_offset1 = a_offset0 + lda; a_offset2 = a_offset1 + lda; a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; - for (BLASLONG i = 0; i < m / 4; i++) { - b_offset[0] = *a_offset0; - b_offset[1] = *a_offset1; - b_offset[2] = *a_offset2; - b_offset[3] = *a_offset3; - b_offset += 4; - a_offset0 += 4 * lda; - a_offset1 += 4 * lda; - a_offset2 += 4 * lda; - a_offset3 += 4 * lda; + for (BLASLONG i = 0; i < m / 8; i++) { + b_offset[0] = a_offset0[0]; + b_offset[1] = a_offset1[0]; + b_offset[2] = a_offset2[0]; + b_offset[3] = a_offset3[0]; + + b_offset[4] = 0; + b_offset[5] = 0; + b_offset[6] = 0; + b_offset[7] = 0; + + b_offset[8] = a_offset4[0]; + b_offset[9] = a_offset5[0]; + b_offset[10] = a_offset6[0]; + b_offset[11] = a_offset7[0]; + + b_offset[12] = 0; + b_offset[13] = 0; + b_offset[14] = 0; + b_offset[15] = 0; + + b_offset += 16; + a_offset0 += 8 * lda; + a_offset1 += 8 * lda; + a_offset2 += 8 * lda; + a_offset3 += 8 * lda; + a_offset4 += 8 * lda; + a_offset5 += 8 * lda; + a_offset6 += 8 * lda; + a_offset7 += 8 * lda; } - if (m & 3) { - BLASLONG rest = m & 3; + if (rest) { b_offset[0] = *a_offset0; b_offset[1] = rest == 1 ? 0 : *a_offset1; b_offset[2] = rest <= 2 ? 0 : *a_offset2; b_offset[3] = rest <= 3 ? 0 : *a_offset3; + + b_offset[4] = 0; + b_offset[5] = 0; + b_offset[6] = 0; + b_offset[7] = 0; + + b_offset[8] = rest <= 4 ? 0 : *a_offset4; + b_offset[9] = rest <= 5 ? 0 : *a_offset5; + b_offset[10] = rest <= 6 ? 0 : *a_offset6; + b_offset[11] = 0; + + b_offset[12] = 0; + b_offset[13] = 0; + b_offset[14] = 0; + b_offset[15] = 0; } } + return 0; } - diff --git a/kernel/arm64/sbgemm_tcopy_8_neoversev1.c b/kernel/arm64/sbgemm_tcopy_8_neoversev1.c deleted file mode 100644 index 1a1198d02..000000000 --- a/kernel/arm64/sbgemm_tcopy_8_neoversev1.c +++ /dev/null @@ -1,200 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2024, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * *****************************************************************************/ -#include - -#include "common.h" - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { - IFLOAT *a_offset, *a_offset0, *a_offset1, *a_offset2, *a_offset3; - IFLOAT *b_offset; - a_offset = a; - b_offset = b; - - uint16x8_t v0, v1, v2, v3, v4, v5, v6, v7; - uint16x4_t v0_h, v1_h, v2_h, v3_h, v4_h, v5_h, v6_h, v7_h; - - for (BLASLONG j = 0; j < n / 8; j++) { - a_offset0 = a_offset; - a_offset1 = a_offset0 + lda; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset += 8; - - for (BLASLONG i = 0; i < m / 4; i++) { - v0 = vld1q_u16(a_offset0); - v1 = vld1q_u16(a_offset1); - v2 = vld1q_u16(a_offset2); - v3 = vld1q_u16(a_offset3); - - v4 = vtrn1q_u16(v0, v1); - v5 = vtrn2q_u16(v0, v1); - v6 = vtrn1q_u16(v2, v3); - v7 = vtrn2q_u16(v2, v3); - - v0 = (uint16x8_t)vtrn1q_u32((uint32x4_t)v4, (uint32x4_t)v6); - v1 = (uint16x8_t)vtrn1q_u32((uint32x4_t)v5, (uint32x4_t)v7); - v2 = (uint16x8_t)vtrn2q_u32((uint32x4_t)v4, (uint32x4_t)v6); - v3 = (uint16x8_t)vtrn2q_u32((uint32x4_t)v5, (uint32x4_t)v7); - - vst1_u16(b_offset, vget_low_u16(v0)); - vst1_u16(b_offset + 4, vget_low_u16(v1)); - vst1_u16(b_offset + 8, vget_low_u16(v2)); - vst1_u16(b_offset + 12, vget_low_u16(v3)); - vst1_u16(b_offset + 16, vget_high_u16(v0)); - vst1_u16(b_offset + 20, vget_high_u16(v1)); - vst1_u16(b_offset + 24, vget_high_u16(v2)); - vst1_u16(b_offset + 28, vget_high_u16(v3)); - - b_offset += 32; - a_offset0 += 4 * lda; - a_offset1 += 4 * lda; - a_offset2 += 4 * lda; - a_offset3 += 4 * lda; - } - - if (m & 3) { - BLASLONG rest = m & 3; - for (BLASLONG line = 0; line < 8; line++) { - b_offset[line * 4] = a_offset0[line]; - b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; - b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; - b_offset[line * 4 + 3] = rest <= 3 ? 
0 : a_offset3[line]; - } - b_offset += 32; - } - } - - if (n & 4) { - a_offset0 = a_offset; - a_offset1 = a_offset0 + lda; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset += 4; - - for (BLASLONG i = 0; i < m / 4; i++) { - v0_h = vld1_u16(a_offset0); - v1_h = vld1_u16(a_offset1); - v2_h = vld1_u16(a_offset2); - v3_h = vld1_u16(a_offset3); - - v4_h = vtrn1_u16(v0_h, v1_h); - v5_h = vtrn2_u16(v0_h, v1_h); - v6_h = vtrn1_u16(v2_h, v3_h); - v7_h = vtrn2_u16(v2_h, v3_h); - - v0_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h); - v1_h = (uint16x4_t)vtrn1_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h); - v2_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v4_h, (uint32x2_t)v6_h); - v3_h = (uint16x4_t)vtrn2_u32((uint32x2_t)v5_h, (uint32x2_t)v7_h); - - vst1_u16(b_offset, v0_h); - vst1_u16(b_offset + 4, v1_h); - vst1_u16(b_offset + 8, v2_h); - vst1_u16(b_offset + 12, v3_h); - - b_offset += 16; - a_offset0 += 4 * lda; - a_offset1 += 4 * lda; - a_offset2 += 4 * lda; - a_offset3 += 4 * lda; - } - - if (m & 3) { - BLASLONG rest = m & 3; - for (BLASLONG line = 0; line < 4; line++) { - b_offset[line * 4] = a_offset0[line]; - b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; - b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; - b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; - } - b_offset += 16; - } - } - - if (n & 2) { - a_offset0 = a_offset; - a_offset1 = a_offset0 + lda; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset += 2; - - for (BLASLONG i = 0; i < m / 4; i++) { - for (BLASLONG line = 0; line < 2; line++) { - b_offset[line * 4] = a_offset0[line]; - b_offset[line * 4 + 1] = a_offset1[line]; - b_offset[line * 4 + 2] = a_offset2[line]; - b_offset[line * 4 + 3] = a_offset3[line]; - } - b_offset += 8; - a_offset0 += 4 * lda; - a_offset1 += 4 * lda; - a_offset2 += 4 * lda; - a_offset3 += 4 * lda; - } - - if (m & 3) { - BLASLONG rest = m & 3; - for (BLASLONG line = 0; line < 2; line++) { - b_offset[line * 4] = a_offset0[line]; - b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; - b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; - b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; - } - b_offset += 8; - } - } - - if (n & 1) { - a_offset0 = a_offset; - a_offset1 = a_offset0 + lda; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - - for (BLASLONG i = 0; i < m / 4; i++) { - b_offset[0] = *a_offset0; - b_offset[1] = *a_offset1; - b_offset[2] = *a_offset2; - b_offset[3] = *a_offset3; - b_offset += 4; - a_offset0 += 4 * lda; - a_offset1 += 4 * lda; - a_offset2 += 4 * lda; - a_offset3 += 4 * lda; - } - - if (m & 3) { - BLASLONG rest = m & 3; - b_offset[0] = *a_offset0; - b_offset[1] = rest == 1 ? 0 : *a_offset1; - b_offset[2] = rest <= 2 ? 0 : *a_offset2; - b_offset[3] = rest <= 3 ? 
0 : *a_offset3; - } - } - return 0; -} - diff --git a/param.h b/param.h index 70b926e96..36e6f619f 100644 --- a/param.h +++ b/param.h @@ -3553,12 +3553,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define SWITCH_RATIO 16 #define GEMM_PREFERED_SIZE 8 #endif -#undef SBGEMM_ALIGN_K -#define SBGEMM_ALIGN_K 4 +#undef SBGEMM_ALIGN_K #undef SBGEMM_DEFAULT_UNROLL_M #undef SBGEMM_DEFAULT_UNROLL_N -#define SBGEMM_DEFAULT_UNROLL_M 8 +#define SBGEMM_ALIGN_K 8 +#define SBGEMM_DEFAULT_UNROLL_M 4 #define SBGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_M 16 From 09414a4187fafe70aa4633e2ded1d3a1bbc4db14 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Feb 2025 18:52:00 +0100 Subject: [PATCH 037/205] Ensure that GEMMTR name appears in XERBLA if gemmt was called as such --- interface/CMakeLists.txt | 4 ++-- interface/Makefile | 12 ++++++------ interface/gemmt.c | 23 +++++++++++++++++++++++ 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 8d815c452..c0d5896e1 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -109,7 +109,7 @@ endif () GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG}) # gemmtr is gemmt under the name adopted by the Reference BLAS - GenerateNamedObjects("gemm.c" "" "gemmtr" ${CBLAS_FLAG}) + GenerateNamedObjects("gemm.c" "RNAME" "gemmtr" ${CBLAS_FLAG}) # max and imax are compiled 4 times GenerateNamedObjects("max.c" "" "" ${CBLAS_FLAG}) @@ -126,7 +126,7 @@ if (BUILD_BFLOAT16) GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") - GenerateNamedObjects("gemmt.c" "" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("gemmt.c" "RNAME" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") diff --git a/interface/Makefile b/interface/Makefile index c22e087c0..849d59bef 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1306,7 +1306,7 @@ sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) sbgemmtr.$(SUFFIX) sbgemmtr.$(PSUFFIX) : sbgemmt.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) + $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) endif sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h @@ -1343,19 +1343,19 @@ zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) sgemmtr.$(SUFFIX) sgemmtr.$(PSUFFIX) : gemmt.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) + $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) dgemmtr.$(SUFFIX) dgemmtr.$(PSUFFIX) : gemmt.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) + $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) qgemmtr.$(SUFFIX) qgemmtr.$(PSUFFIX) : gemmt.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) + $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) cgemmtr.$(SUFFIX) cgemmtr.$(PSUFFIX) : gemmt.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) + $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) zgemmtr.$(SUFFIX) zgemmtr.$(PSUFFIX) : gemmt.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) + $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c 
$(CC) -c $(CFLAGS) $< -o $(@F) diff --git a/interface/gemmt.c b/interface/gemmt.c index 01747af41..cf01c4dcf 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -38,6 +38,7 @@ #ifndef COMPLEX #define SMP_THRESHOLD_MIN 65536.0 +#ifdef RNAME #ifdef XDOUBLE #define ERROR_NAME "QGEMMT " #elif defined(DOUBLE) @@ -48,7 +49,28 @@ #define ERROR_NAME "SGEMMT " #endif #else +#ifdef XDOUBLE +#define ERROR_NAME "QGEMMTR" +#elif defined(DOUBLE) +#define ERROR_NAME "DGEMMTR" +#elif defined(BFLOAT16) +#define ERROR_NAME "SBGEMMTR" +#else +#define ERROR_NAME "SGEMMTR" +#endif +#endif +#else #define SMP_THRESHOLD_MIN 8192.0 +#ifdef RNAME +#ifdef XDOUBLE +#define ERROR_NAME "XGEMMTR" +#elif defined(DOUBLE) +#define ERROR_NAME "ZGEMMTR" +#else +#define ERROR_NAME "CGEMMTR" +#endif +#endif +#else #ifdef XDOUBLE #define ERROR_NAME "XGEMMT " #elif defined(DOUBLE) @@ -57,6 +79,7 @@ #define ERROR_NAME "CGEMMT " #endif #endif +#endif #ifndef GEMM_MULTITHREAD_THRESHOLD #define GEMM_MULTITHREAD_THRESHOLD 4 From 7c3e169b67c246e3e2fc1174c6b39e0f54e5d139 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Feb 2025 19:21:08 +0100 Subject: [PATCH 038/205] Update gemmt.c --- interface/gemmt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/interface/gemmt.c b/interface/gemmt.c index cf01c4dcf..d63115f77 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -69,7 +69,6 @@ #else #define ERROR_NAME "CGEMMTR" #endif -#endif #else #ifdef XDOUBLE #define ERROR_NAME "XGEMMT " From ff30ac96662e850fdc7f617dfd1a0078298b2229 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Feb 2025 19:51:23 +0100 Subject: [PATCH 039/205] Update Makefile --- interface/Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 849d59bef..f09a6f46b 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1304,7 +1304,7 @@ ifeq ($(BUILD_BFLOAT16),1) sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) + $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) sbgemmtr.$(SUFFIX) sbgemmtr.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) endif @@ -1328,19 +1328,19 @@ xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) sgemmt.$(SUFFIX) sgemmt.$(PSUFFIX) : gemmt.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) + $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) dgemmt.$(SUFFIX) dgemmt.$(PSUFFIX) : gemmt.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) + $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) qgemmt.$(SUFFIX) qgemmt.$(PSUFFIX) : gemmt.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) + $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) + $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h - $(CC) -c $(CFLAGS) $< -o $(@F) + $(CC) -c $(CFLAGS) -URNAME $< -o $(@F) sgemmtr.$(SUFFIX) sgemmtr.$(PSUFFIX) : gemmt.c ../param.h $(CC) -c $(CFLAGS) -DRNAME $< -o $(@F) From db7e5f1fa751ec454155764dab93ab9554e73bdc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Feb 2025 21:26:20 +0100 Subject: [PATCH 040/205] Update gemmt.c --- interface/gemmt.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/interface/gemmt.c b/interface/gemmt.c index d63115f77..bcccf5a74 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -40,23 +40,23 @@ #define SMP_THRESHOLD_MIN 65536.0 
#ifdef RNAME #ifdef XDOUBLE -#define ERROR_NAME "QGEMMT " +#define ERROR_NAME "QGEMMTR" #elif defined(DOUBLE) -#define ERROR_NAME "DGEMMT " +#define ERROR_NAME "DGEMMTR" #elif defined(BFLOAT16) -#define ERROR_NAME "SBGEMMT " +#define ERROR_NAME "SBGEMMTR" #else -#define ERROR_NAME "SGEMMT " +#define ERROR_NAME "SGEMMTR" #endif #else #ifdef XDOUBLE -#define ERROR_NAME "QGEMMTR" +#define ERROR_NAME "QGEMMT " #elif defined(DOUBLE) -#define ERROR_NAME "DGEMMTR" +#define ERROR_NAME "DGEMMT " #elif defined(BFLOAT16) -#define ERROR_NAME "SBGEMMTR" +#define ERROR_NAME "SBGEMMT " #else -#define ERROR_NAME "SGEMMTR" +#define ERROR_NAME "SGEMMT " #endif #endif #else From 0fd5448b2c3a3e16410bff49489f968c44e51d91 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Feb 2025 19:33:05 +0100 Subject: [PATCH 041/205] Handle INCX=0 --- interface/nrm2.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/interface/nrm2.c b/interface/nrm2.c index 331ebc3d0..22b24396f 100644 --- a/interface/nrm2.c +++ b/interface/nrm2.c @@ -61,6 +61,37 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ #else return fabsf(x[0]); #endif +#endif + + if (incx == 0) +#ifndef COMPLEX +#ifdef DOUBLE + return (sqrt((double)n)*fabs(x[0])); +#else + return (sqrt((float)n)*fabsf(x[0])); +#endif +#else +#ifdef DOUBLE + { + double fr=fabs(x[0]); + double fi=fabs(x[1]); + double fmin=MIN(fr,fi); + double fmax=MAX(fr,fi); + if (fmax==0.) return(fmax); + if (fmax==fmin) return(sqrt((double)n)*sqrt(2.)*fmax); + return (sqrt((double)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); + } +#else + { + float fr=fabs(x[0]); + float fi=fabs(x[1]); + float fmin=MIN(fr,fi); + float fmax=MAX(fr,fi); + if (fmax==0.) return(fmax); + if (fmax==fmin) return(sqrt((float)n)*sqrt(2.)*fmax); + return (sqrt((float)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); + } +#endif #endif if (incx < 0) @@ -97,13 +128,50 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ if (n <= 0) return 0.; - #ifndef COMPLEX if (n == 1) +#ifndef COMPLEX #ifdef DOUBLE return fabs(x[0]); #else return fabsf(x[0]); #endif +#else +#ifdef DOUBLE + return fabs(x[0]+fabs(x[1])); +#else + return fabsf(x[0]+fabsf(x[1])); +#endif +#endif + + if (incx == 0) +#ifndef COMPLEX +#ifdef DOUBLE + return (sqrt((double)n)*fabs(x[0])); +#else + return (sqrt((float)n)*fabsf(x[0])); +#endif +#else +#ifdef DOUBLE + { + double fr=fabs(x[0]); + double fi=fabs(x[1]); + double fmin=MIN(fr,fi); + double fmax=MAX(fr,fi); + if (fmax==0.) return(fmax); + if (fmax==fmin) return(sqrt((double)n)*sqrt(2.)*fmax); + return (sqrt((double)n) * fmax * sqrt (1. + (fmin/fmax)*(fmin/fmax))); + } +#else + { + float fr=fabs(x[0]); + float fi=fabs(x[1]); + float fmin=MIN(fr,fi); + float fmax=MAX(fr,fi); + if (fmax==0.) return(fmax); + if (fmax==fmin) return(sqrt((float)n)*sqrt(2.)*fmax); + return (sqrt((float)n) * fmax * sqrt (1. 
+ (fmin/fmax)*(fmin/fmax))); + } +#endif #endif if (incx < 0) From 60d0be0e971c064ea7a2bee0cb1247c387e18b3c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Feb 2025 23:42:21 +0100 Subject: [PATCH 042/205] Update nrm2.c --- interface/nrm2.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/interface/nrm2.c b/interface/nrm2.c index 22b24396f..cfeb13df8 100644 --- a/interface/nrm2.c +++ b/interface/nrm2.c @@ -128,19 +128,13 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ if (n <= 0) return 0.; - if (n == 1) #ifndef COMPLEX + if (n == 1) #ifdef DOUBLE return fabs(x[0]); #else return fabsf(x[0]); #endif -#else -#ifdef DOUBLE - return fabs(x[0]+fabs(x[1])); -#else - return fabsf(x[0]+fabsf(x[1])); -#endif #endif if (incx == 0) From 3a4a9b21eb8d5b18e6b18b077d7c8282504a3409 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Feb 2025 20:16:03 +0100 Subject: [PATCH 043/205] Disable tests with incx,incy=0 (undefined behavior) --- utest/test_extensions/test_crot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/utest/test_extensions/test_crot.c b/utest/test_extensions/test_crot.c index 1ff456813..f400b345c 100644 --- a/utest/test_extensions/test_crot.c +++ b/utest/test_extensions/test_crot.c @@ -166,7 +166,7 @@ static float c_api_check_csrot(blasint n, blasint inc_x, blasint inc_y, float *c norm += cblas_scnrm2(n, data_crot.y_test, inc_y_abs); return (norm / 2); } - +#if 0 /** * Fortran API specific test * Test crot by comparing it with caxpby. @@ -192,7 +192,7 @@ CTEST(crot, inc_x_0_inc_y_0) float norm = check_csrot(n, inc_x, inc_y, c, s); ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } - +#endif /** * Fortran API specific test * Test crot by comparing it with caxpby. @@ -478,7 +478,7 @@ CTEST(crot, check_n_zero) float norm = check_csrot(n, inc_x, inc_y, c, s); ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } - +#if 0 /** * C API specific test * Test crot by comparing it with caxpby. @@ -504,7 +504,7 @@ CTEST(crot, c_api_inc_x_0_inc_y_0) float norm = c_api_check_csrot(n, inc_x, inc_y, c, s); ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS); } - +#endif /** * C API specific test * Test crot by comparing it with caxpby. From 57208b8bcea098213c908ae35001c233e33dcd38 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Feb 2025 20:17:29 +0100 Subject: [PATCH 044/205] Disable tests with incx,incy=0 (undefined behavior) --- utest/test_extensions/test_zrot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/utest/test_extensions/test_zrot.c b/utest/test_extensions/test_zrot.c index c5ae22fc5..f4ee06391 100644 --- a/utest/test_extensions/test_zrot.c +++ b/utest/test_extensions/test_zrot.c @@ -164,7 +164,7 @@ static double c_api_check_zdrot(blasint n, blasint inc_x, blasint inc_y, double norm += cblas_dznrm2(n, data_zrot.y_test, inc_y_abs); return (norm / 2); } - +#if 0 /** * Fortran API specific test * Test zrot by comparing it with zaxpby. @@ -190,7 +190,7 @@ CTEST(zrot, inc_x_0_inc_y_0) double norm = check_zdrot(n, inc_x, inc_y, c, s); ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } - +#endif /** * Fortran API specific test * Test zrot by comparing it with zaxpby. @@ -476,7 +476,7 @@ CTEST(zrot, check_n_zero) double norm = check_zdrot(n, inc_x, inc_y, c, s); ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } - +#if 0 /** * C API specific test * Test zrot by comparing it with zaxpby. 
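(Context for the crot/zrot utest changes above, patches 043 and 044: the inc_x = 0 / inc_y = 0 cases are fenced off with #if 0 rather than "fixed" because the rot result is not uniquely defined for zero increments. The sketch below is illustrative only and is not part of the patch series; it shows a reference-style real-precision loop under a hypothetical name, srot_ref, the disabled crot/zrot cases being the complex analogue.)

/* Reference-style rot loop, minimal sketch for illustration.
 * With incx == 0 and incy == 0 the indices never advance, so the same
 * pair (x[0], y[0]) is rotated n times; an optimized kernel that loads
 * that pair once and stores it once rotates it only once.  Both are
 * defensible, so no single expected value exists for these test cases. */
static void srot_ref(int n, float *x, int incx, float *y, int incy,
                     float c, float s)
{
    int ix = 0, iy = 0;   /* reference code also offsets these for negative increments */
    for (int i = 0; i < n; i++) {
        float temp = c * x[ix] + s * y[iy];
        y[iy] = c * y[iy] - s * x[ix];
        x[ix] = temp;
        ix += incx;       /* stays at 0 when incx == 0 */
        iy += incy;       /* stays at 0 when incy == 0 */
    }
}
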
@@ -502,7 +502,7 @@ CTEST(zrot, c_api_inc_x_0_inc_y_0) double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s); ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS); } - +#endif /** * C API specific test * Test zrot by comparing it with zaxpby. From 5d6356bc16e0033a591478ff009b04d98990b512 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Mon, 20 Jan 2025 10:45:01 +0800 Subject: [PATCH 045/205] LoongArch64: Fixed amax_lsx.S Fixed register zeroing operation Signed-off-by: Hao Chen Signed-off-by: gxw --- kernel/loongarch64/amax_lsx.S | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/loongarch64/amax_lsx.S b/kernel/loongarch64/amax_lsx.S index fb3b77a0e..504e331ef 100644 --- a/kernel/loongarch64/amax_lsx.S +++ b/kernel/loongarch64/amax_lsx.S @@ -56,17 +56,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif + vxor.v VM0, VM0, VM0 bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT -#ifdef DOUBLE - vldrepl.d VM0, X, 0 -#else - vldrepl.w VM0, X, 0 -#endif - VFSUB VM0, VM0, VM0 bne INCX, TEMP, .L20 srai.d I, N, 3 From 31d326f8951330ffcf191da987fdac5ffe48665a Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Mon, 20 Jan 2025 10:45:20 +0800 Subject: [PATCH 046/205] LoongArch64: Fixed dot_lsx.S Fixed incorrect register usage in instructions Signed-off-by: gxw --- kernel/loongarch64/dot_lsx.S | 84 +++++++++++++----------------------- 1 file changed, 29 insertions(+), 55 deletions(-) diff --git a/kernel/loongarch64/dot_lsx.S b/kernel/loongarch64/dot_lsx.S index 8a74d82e7..ecdf8da44 100644 --- a/kernel/loongarch64/dot_lsx.S +++ b/kernel/loongarch64/dot_lsx.S @@ -53,8 +53,8 @@ PROLOGUE #endif /* init $f8 and $f9 to zero */ - SUB s1, s1, s1 - SUB s2, s2, s2 + vxor.v $vr8, $vr8, $vr8 + vxor.v $vr9, $vr9, $vr9 slli.d INCX, INCX, BASE_SHIFT li.d TEMP, SIZE slli.d INCY, INCY, BASE_SHIFT @@ -64,20 +64,6 @@ PROLOGUE /* !((inc_x == 1) && (inc_y == 1)) */ - /* init $vr8 and $vr9 to zero */ -#ifdef DOUBLE - vldrepl.d $vr0, X, 0 -#else - vldrepl.w $vr0, X, 0 -#endif -#ifdef DSDOT - vfcvtl.d.s $vr0, $vr0 - vfsub.d $vr8, $vr0, $vr0 - vfsub.d $vr9, $vr0, $vr0 -#else - VFSUB $vr8, $vr0, $vr0 - VFSUB $vr9, $vr0, $vr0 -#endif #ifdef DOUBLE srai.d I, N, 3 @@ -99,31 +85,31 @@ PROLOGUE addi.w I, I, -1 addi.d X, X, 64 addi.d Y, Y, 64 -#ifdef DSDOT +#ifndef DOUBLE vfcvtl.d.s $vr10, $vr0 vfcvtl.d.s $vr11, $vr4 vfcvth.d.s $vr12, $vr0 vfcvth.d.s $vr13, $vr4 - vfmadd.d $vr8, $vr10, $vr12, $vr8 - vfmadd.d $vr9, $vr11, $vr13, $vr9 + vfmadd.d $vr8, $vr10, $vr11, $vr8 + vfmadd.d $vr9, $vr12, $vr13, $vr9 vfcvtl.d.s $vr10, $vr1 vfcvtl.d.s $vr11, $vr5 vfcvth.d.s $vr12, $vr1 vfcvth.d.s $vr13, $vr5 - vfmadd.d $vr8, $vr10, $vr12, $vr8 - vfmadd.d $vr9, $vr11, $vr13, $vr9 + vfmadd.d $vr8, $vr10, $vr11, $vr8 + vfmadd.d $vr9, $vr12, $vr13, $vr9 vfcvtl.d.s $vr10, $vr2 vfcvtl.d.s $vr11, $vr6 vfcvth.d.s $vr12, $vr2 vfcvth.d.s $vr13, $vr6 - vfmadd.d $vr8, $vr10, $vr12, $vr8 - vfmadd.d $vr9, $vr11, $vr13, $vr9 + vfmadd.d $vr8, $vr10, $vr11, $vr8 + vfmadd.d $vr9, $vr12, $vr13, $vr9 vfcvtl.d.s $vr10, $vr3 vfcvtl.d.s $vr11, $vr7 vfcvth.d.s $vr12, $vr3 vfcvth.d.s $vr13, $vr7 - vfmadd.d $vr8, $vr10, $vr12, $vr8 - vfmadd.d $vr9, $vr11, $vr13, $vr9 + vfmadd.d $vr8, $vr10, $vr11, $vr8 + vfmadd.d $vr9, $vr12, $vr13, $vr9 #else VFMADD $vr8, $vr0, $vr4, $vr8 VFMADD $vr9, $vr1, $vr5, $vr9 @@ -149,13 +135,13 @@ PROLOGUE addi.w I, I, -1 addi.d X, X, 16 addi.d Y, Y, 16 -#ifdef DSDOT +#ifndef DOUBLE vfcvtl.d.s $vr10, $vr0 vfcvtl.d.s $vr11, 
$vr4 vfcvth.d.s $vr12, $vr0 vfcvth.d.s $vr13, $vr4 - vfmadd.d $vr8, $vr10, $vr12, $vr8 - vfmadd.d $vr9, $vr11, $vr13, $vr9 + vfmadd.d $vr8, $vr10, $vr11, $vr8 + vfmadd.d $vr9, $vr12, $vr13, $vr9 #else VFMADD $vr8, $vr0, $vr4, $vr8 #endif @@ -163,23 +149,10 @@ PROLOGUE .align 3 .L14: /* store dot in s1 $f8 */ -#ifdef DSDOT vfadd.d $vr8, $vr8, $vr9 - fsub.s s2, s2, s2 /* set s2 to 0.0 */ + fsub.d s2, s2, s2 /* set s2 to 0.0 */ vpackod.d $vr0, $vr8, $vr8 vfadd.d $vr8, $vr8, $vr0 -#else - VFADD $vr8, $vr8, $vr9 - SUB s2, s2, s2 /* set s2 to 0.0 */ - vpackod.d $vr0, $vr8, $vr8 -#ifdef DOUBLE - VFADD $vr8, $vr8, $vr0 -#else - VFADD $vr8, $vr8, $vr0 - vpackod.w $vr0, $vr8, $vr8 - VFADD $vr8, $vr8, $vr0 -#endif /* defined DOUBLE */ -#endif /* defined DSDOT */ .align 3 .L15: #ifdef DOUBLE @@ -193,7 +166,7 @@ PROLOGUE /* DOUBLE: 1 ; FLOAT: 1~3 */ LD a1, X, 0 LD b1, Y, 0 -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -236,7 +209,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -248,7 +221,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -260,7 +233,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -272,7 +245,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -284,7 +257,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -296,7 +269,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -308,7 +281,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -321,7 +294,7 @@ PROLOGUE LD b1, Y, 0 * SIZE add.d Y, Y, INCY addi.d I, I, -1 -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -342,7 +315,7 @@ PROLOGUE LD b1, Y, 0 * SIZE add.d Y, Y, INCY addi.d I, I, -1 -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -353,12 +326,13 @@ PROLOGUE .align 3 .L999: -#ifdef DSDOT fadd.d $f0, s1, s2 + move $r4, $r17 +#if defined(DOUBLE) +#elif defined(DSDOT) #else - ADD $f0, s1, s2 + fcvt.s.d $f0, $f0 #endif - move $r4, $r17 jirl $r0, $r1, 0x0 EPILOGUE From 7f1ebc7ae62e48c45b61da09b1e2ed8e36518850 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Thu, 6 Feb 2025 16:52:06 +0800 Subject: [PATCH 047/205] LoongArch64: Fixed iamax_lsx.S Fixed index retrieval issue when there are identical maximum absolute values Signed-off-by: Hao Chen Signed-off-by: gxw --- kernel/loongarch64/iamax_lsx.S | 234 +++++++++++++++++++-------------- 1 file changed, 134 insertions(+), 100 deletions(-) diff --git a/kernel/loongarch64/iamax_lsx.S b/kernel/loongarch64/iamax_lsx.S index ce5b3c724..4985458ca 100644 --- a/kernel/loongarch64/iamax_lsx.S +++ b/kernel/loongarch64/iamax_lsx.S @@ -56,19 +56,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VI3 $vr8 #define VI4 $vr19 #define VT0 $vr23 +#define VZE $vr3 +#define VT1 $vr4 +#define VT2 $vr5 +#define VC0 $vr6 PROLOGUE li.d i0, 0 bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 + vldi VZE, 0 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 vld VM0, X, 0 #ifdef DOUBLE + vfsub.d VT1, VZE, VM0 addi.d i0, i0, 1 srai.d I, N, 3 + vfmaxa.d VM0, VM0, VT1 bge $r0, I, .L11 slli.d i0, i0, 1 //2 vreplgr2vr.d VINC2, i0 @@ -79,12 +86,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d i0, i0, 1 vinsgr2vr.d VI1, i0, 1 addi.d i0, i0, 3 - vinsgr2vr.d VI0, i0, 0 //1 + vinsgr2vr.d VI0, i0, 0 //initialize the index value for vectorization addi.d i0, i0, 1 - vinsgr2vr.d VI0, i0, 1 //2 + vinsgr2vr.d VI0, i0, 1 #else + vfsub.s VT1, VZE, VM0 addi.w i0, i0, 1 srai.d I, N, 3 + vfmaxa.s VM0, VM0, VT1 bge $r0, I, .L21 slli.w i0, i0, 2 //4 vreplgr2vr.w VINC2, i0 @@ -115,39 +124,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vadd.d VI1, VI1, VINC4 vld VX1, X, 2 * SIZE vadd.d VI2, VI1, VINC2 - vfmaxa.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 + vfsub.d VT1, VZE, VX0 + vfsub.d VT2, VZE, VX1 + vfmaxa.d VX0, VX0, VT1 + vfmaxa.d VX1, VX1, VT2 + vfcmp.clt.d VT0, VX0, VX1 //abx(x0) < abs(x1) + vbitsel.v x1, VX0, VX1, VT0 //abs(maxf) + vbitsel.v x2, VI1, VI2, VT0 //i + vld VX0, X, 4 * SIZE vadd.d VI1, VI2, VINC2 vld VX1, X, 6 * SIZE vadd.d VI2, VI1, VINC2 - vfmaxa.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmaxa.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - vbitsel.v x2, x4, x2, VT0 - vfmaxa.d VM1, VM0, x3 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, x2, VI0, VT0 + vfsub.d VT1, VZE, VX0 + vfsub.d VT2, VZE, VX1 + vfmaxa.d VX0, VX0, VT1 + vfmaxa.d VX1, VX1, VT2 + vfcmp.clt.d VT0, VX0, VX1 + vbitsel.v x3, VX0, VX1, VT0 //abs(maxf) + vbitsel.v x4, VI1, VI2, VT0 //i + vfcmp.clt.d VC0, x1, x3 + vbitsel.v x1, x1, x3, VC0 //abs(maxf) + vbitsel.v x2, x2, x4, VC0 //i + vfcmp.clt.d VT0, VM0, x1 addi.d I, I, -1 addi.d X, X, 8 * SIZE + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 #else vld VX0, X, 0 * SIZE vadd.w VI1, VI1, VINC4 vld VX1, X, 4 * SIZE vadd.w VI2, VI1, VINC2 - vfmaxa.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 + vfsub.s VT1, VZE, VX0 + vfsub.s VT2, VZE, VX1 + vfmaxa.s VX0, VX0, VT1 + vfmaxa.s VX1, VX1, VT2 + vfcmp.clt.s VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 //abs(maxf) + vbitsel.v x2, VI1, VI2, VT0 //i addi.d I, I, -1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmaxa.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 + vfcmp.clt.s VT0, VM0, x1 addi.d X, X, 8 * SIZE - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 + #endif blt $r0, I, .L10 .align 3 @@ -158,7 +179,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vreplvei.d VI2, VI0, 1 vreplvei.d x1, VM0, 0 vreplvei.d x2, VM0, 1 - fcmp.ceq.d $fcc0, $f10, $f9 + fcmp.ceq.d $fcc0, $f9, $f10 bceqz $fcc0, .L16 vfcmp.clt.d VT0, VI1, VI2 vbitsel.v VI0, VI2, VI1, VT0 @@ -172,28 +193,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vreplvei.w x2, VM0, 1 vreplvei.w x3, VM0, 2 vreplvei.w x4, VM0, 3 - vfmaxa.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC2, VI2, VI1, VT0 - vfmaxa.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC4, VI4, VI3, VT0 - vfmaxa.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC4, VINC2, VT0 - fcmp.ceq.d $fcc0, $f15, $f9 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 b .L26 #endif .align 3 #ifdef DOUBLE .L16: - vfmaxa.d VM0, x1, x2 - vfcmp.ceq.d VT0, x1, VM0 - vbitsel.v VI0, VI2, VI1, VT0 + vfcmp.clt.d VT0, x1, x2 + vbitsel.v VI0, VI1, VI2, VT0 + vbitsel.v VM0, x1, x2, VT0 .align 3 .L17: @@ -212,10 +220,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L13: fld.d $f9, X, 0 - vfmaxa.d VM1, x1, VM0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 + fsub.d $f10, $f3, $f9 + vfmaxa.d x1, x1, x2 + vfcmp.clt.d VT0, VM0, x1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, VI1, VT0 addi.d I, I, -1 addi.d i1, i1, 1 addi.d X, X, SIZE @@ -241,10 +250,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d TEMP, TEMP, INCX vinsgr2vr.d VM0, t2, 1 slli.d i0, i0, 1 //2 + vfsub.d VT1, VZE, VM0 vreplgr2vr.d VINC2, i0 slli.d i0, i0, 1 //4 vreplgr2vr.d VINC4, i0 addi.d i0, i0, -7 + vfmaxa.d VM0, VM0, VT1 vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization addi.d i0, i0, 1 vinsgr2vr.d VI1, i0, 1 @@ -269,9 +280,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d X, X, INCX vinsgr2vr.d VX1, t2, 1 vadd.d VI2, VI1, VINC2 - vfmaxa.d x1, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x1 - vbitsel.v x2, VI2, VI1, VT0 + + vfsub.d VT1, VZE, VX0 + vfsub.d VT2, VZE, VX1 + vfmaxa.d VX0, VX0, VT1 + vfmaxa.d VX1, VX1, VT2 + vfcmp.clt.d VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 ld.d t1, X, 0 * SIZE add.d X, X, INCX vinsgr2vr.d VX0, t1, 0 @@ -286,16 +302,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d X, X, INCX vinsgr2vr.d VX1, t2, 1 vadd.d VI2, VI1, VINC2 - vfmaxa.d x3, VX0, VX1 - vfcmp.ceq.d VT0, VX0, x3 - vbitsel.v x4, VI2, VI1, VT0 - vfmaxa.d x3, x1, x3 - vfcmp.ceq.d VT0, x1, x3 - vbitsel.v x2, x4, x2, VT0 - vfmaxa.d VM1, VM0, x3 - vbitsel.v VM0, VM1, VM0, VT0 - vfcmp.ceq.d VT0, VM0, VM1 - vbitsel.v VI0, x2, VI0, VT0 + vfsub.d VT1, VZE, VX0 + vfsub.d VT2, VZE, VX1 + vfmaxa.d VX0, VX0, VT1 + vfmaxa.d VX1, VX1, VT2 + vfcmp.clt.d VT0, VX0, VX1 + vbitsel.v x3, VX0, VX1, VT0 + vbitsel.v x4, VI1, VI2, VT0 + vfcmp.clt.d VC0, x1, x3 + vbitsel.v x1, x1, x3, VC0 + vbitsel.v x2, x2, x4, VC0 + vfcmp.clt.d VT0, VM0, x1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 + addi.d I, I, -1 blt $r0, I, .L24 .align 3 @@ -313,9 +333,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L26: - vfmaxa.d VM0, x1, x2 - vfcmp.ceq.d VT0, x1, VM0 - vbitsel.v VI0, VI2, VI1, VT0 + vfcmp.clt.d VT0, x1, x2 + vbitsel.v VI0, VI1, VI2, VT0 + vbitsel.v VM0, x1, x2, VT0 .align 3 .L27: @@ -389,14 +409,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vinsgr2vr.w VX1, t3, 2 vinsgr2vr.w VX1, t4, 3 vadd.w VI2, VI1, VINC2 - vfmaxa.s VM1, VX0, VX1 - vfcmp.ceq.s VT0, VX0, VM1 - vbitsel.v VI2, VI2, VI1, VT0 - vfmaxa.s VM1, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 + vfsub.s VT1, VZE, VX0 + vfsub.s VT2, VZE, VX1 + vfmaxa.s VX0, VX0, VT1 + vfmaxa.s VX1, VX1, VT2 + vfcmp.clt.s VT0, VX0, VX1 + vbitsel.v x1, VX0, VX1, VT0 + vbitsel.v x2, VI1, VI2, VT0 //i + addi.d I, I, -1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI2, VI0, VT0 + vfcmp.clt.s VT0, VM0, x1 + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, x2, VT0 blt $r0, I, .L24 .align 3 @@ -409,42 +433,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vreplvei.w x2, VM0, 1 vreplvei.w x3, VM0, 2 vreplvei.w x4, VM0, 3 - vfmaxa.s VM1, x1, x2 - vfcmp.ceq.s VT0, VM1, x1 - vbitsel.v VINC2, VI2, VI1, VT0 - vfmaxa.s VM0, x3, x4 - vfcmp.ceq.s VT0, x3, VM0 - vbitsel.v VINC4, VI4, VI3, VT0 - vfmaxa.s VM0, VM0, VM1 - vfcmp.ceq.s VT0, VM0, VM1 - vbitsel.v VI0, VINC4, VINC2, VT0 - fcmp.ceq.d $fcc0, $f15, $f9 - bceqz $fcc0, .L26 - vfcmp.clt.s VT0, VI1, VI0 - vbitsel.v VI0, VI0, VI1, VT0 .align 3 .L26: - fcmp.ceq.d $fcc0, $f15, $f10 - bceqz $fcc0, .L27 - vfcmp.clt.s VT0, VI2, VI0 - vbitsel.v VI0, VI0, VI2, VT0 + fcmp.ceq.s $fcc0, $f9, $f10 + bceqz $fcc0, .L31 + vfcmp.clt.s VT0, VI1, VI2 + vbitsel.v VI1, VI2, VI1, VT0 + b .L32 .align 3 - -.L27: - fcmp.ceq.d $fcc0, $f15, $f11 - bceqz $fcc0, .L28 - vfcmp.clt.s VT0, VI3, VI0 - vbitsel.v VI0, VI0, VI3, VT0 +.L31: + vfcmp.clt.s VT0, x1, x2 + vbitsel.v VI1, VI1, VI2, VT0 + vbitsel.v x1, x1, x2, VT0 .align 3 - -.L28: - fcmp.ceq.d $fcc0, $f15, $f12 - bceqz $fcc0, .L29 - vfcmp.clt.s VT0, VI4, VI0 - vbitsel.v VI0, VI0, VI4, VT0 +.L32: + fcmp.ceq.s $fcc0, $f11, $f12 + bceqz $fcc0, .L33 + vfcmp.clt.s VT1, VI3, VI4 + vbitsel.v VI3, VI4, VI3, VT1 + b .L34 + .align 3 +.L33: + vfcmp.clt.s VT1, x3, x4 + vbitsel.v x3, x3, x4, VT1 + vbitsel.v VI3, VI3, VI4, VT1 + .align 3 +.L34: + fcmp.ceq.s $fcc0, $f9, $f11 + bceqz $fcc0, .L35 + vfcmp.clt.s VT0, VI1, VI3 + vbitsel.v VI0, VI3, VI1, VT0 + vxor.v VM0, x1, VZE + b .L29 + .align 3 +.L35: + vfcmp.clt.s VT0, x1, x3 + vbitsel.v VM0, x1, x3, VT0 + vbitsel.v VI0, VI1, VI3, VT0 .align 3 - .L29: movfr2gr.s i0, $f20 .align 3 @@ -462,10 +489,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L22: LD $f9, X, 0 - VFMAXA VM1, x1, VM0 - VCMPEQ VT0, VM0, VM1 - vbitsel.v VM0, VM1, VM0, VT0 - vbitsel.v VI0, VI1, VI0, VT0 +#ifdef DOUBLE + fsub.d $f10, $f3, $f9 + vfmaxa.d x1, x1, x2 + vfcmp.clt.d VT0, VM0, x1 +#else + fsub.s $f10, $f3, $f9 + vfmaxa.s x1, x1, x2 + vfcmp.clt.s VT0, VM0, x1 +#endif + vbitsel.v VM0, VM0, x1, VT0 + vbitsel.v VI0, VI0, VI1, VT0 addi.d I, I, -1 addi.d i1, i1, 1 add.d X, X, INCX From c2212d0abd533e9a866de3f48afece269afbcd94 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Fri, 7 Feb 2025 18:02:04 +0800 Subject: [PATCH 048/205] LoongArch64: Fixed copy_lsx.S Fixed incorrect store operation Signed-off-by: gxw --- kernel/loongarch64/copy_lsx.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/loongarch64/copy_lsx.S b/kernel/loongarch64/copy_lsx.S index bb10f3565..96e34a6e4 100644 --- a/kernel/loongarch64/copy_lsx.S +++ b/kernel/loongarch64/copy_lsx.S @@ -270,9 +270,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add.d Y, Y, INCY ST a2, Y, 0 add.d Y, Y, INCY - ST a3, X, 0 + ST a3, Y, 0 add.d Y, Y, INCY - ST a4, X, 0 + ST a4, Y, 0 add.d Y, Y, INCY LD a1, X, 0 add.d X, X, INCX @@ -286,9 +286,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d Y, Y, INCY ST a2, Y, 0 add.d Y, Y, INCY - ST a3, X, 0 + ST a3, Y, 0 add.d Y, Y, INCY - ST a4, X, 0 + ST a4, Y, 0 add.d Y, Y, INCY addi.d I, I, -1 blt $r0, I, .L222 From e8c740368c4bcf68b86f176fc32f26b4f42934f2 Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 12 Feb 2025 14:52:49 +0800 Subject: [PATCH 049/205] LoongArch64: Fixed rot_lsx.S ane crot_lsx.S Do not check whether the input parameters c and s are zero, as this may cause errors with special values (same as scal). Although OpenBLAS's own test suite doesn't catch this, it will cause LAPACK test cases to fail. --- kernel/loongarch64/crot_lsx.S | 597 +------------- kernel/loongarch64/rot_lsx.S | 1405 +++------------------------------ 2 files changed, 119 insertions(+), 1883 deletions(-) diff --git a/kernel/loongarch64/crot_lsx.S b/kernel/loongarch64/crot_lsx.S index af8f13b77..663bd8247 100644 --- a/kernel/loongarch64/crot_lsx.S +++ b/kernel/loongarch64/crot_lsx.S @@ -75,6 +75,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT slli.d INCY, INCY, ZBASE_SHIFT + move YY, Y + move XX, X MTG t1, C MTG t2, S MTG t3, a1 @@ -89,25 +91,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vreplgr2vr.w VXZ, t3 srai.d I, N, 2 #endif + bge $r0, I, .L997 beq INCX, $r0, .L996 beq INCY, $r0, .L996 - bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1 - bne INCY, TEMP, .L22 - -.L11: - bge $r0, I, .L997 - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L110 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L112 // C!=0 S==0 - b .L111 // C!=0 S!=0 - .align 3 - -.L110: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L114 // C==0 S==0 - b .L113 // C==0 S!=0 - .align 3 + bne INCX, TEMP, .L221 // INCX!=1 or INCY!=1 + bne INCY, TEMP, .L221 .L111: // C!=0 S!=0 vld VX0, X, 0 * SIZE @@ -168,151 +156,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
b .L997 .align 3 -.L112: // C!=0 S==0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE -#ifdef DOUBLE - vld VX1, X, 2 * SIZE - vld VX3, Y, 2 * SIZE - vpickev.d x1, VX1, VX0 - vpickod.d x2, VX1, VX0 - vpickev.d x3, VX3, VX2 - vpickod.d x4, VX3, VX2 - vfmul.d VX0, x1, VXC - vfmul.d VX1, x3, VXC - vfmul.d VX2, x2, VXC - vfmul.d VX3, x4, VXC - vilvl.d x1, VX2 ,VX0 - vilvh.d x2, VX2, VX0 - vilvl.d x3, VX3 ,VX1 - vilvh.d x4, VX3, VX1 - vst x1, X, 0 * SIZE - vst x3, Y, 0 * SIZE - vst x2, X, 2 * SIZE - vst x4, Y, 2 * SIZE - addi.d X, X, 4 * SIZE - addi.d Y, Y, 4 * SIZE -#else - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vpickev.w x1, VX1, VX0 - vpickod.w x2, VX1, VX0 - vpickev.w x3, VX3, VX2 - vpickod.w x4, VX3, VX2 - vfmul.s VX0, x1, VXC - vfmul.s VX1, x3, VXC - vfmul.s VX2, x2, VXC - vfmul.s VX3, x4, VXC - vilvl.w x1, VX2 ,VX0 - vilvh.w x2, VX2, VX0 - vilvl.w x3, VX3 ,VX1 - vilvh.w x4, VX3, VX1 - vst x1, X, 0 * SIZE - vst x3, Y, 0 * SIZE - vst x2, X, 4 * SIZE - vst x4, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE -#endif - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // C==0 S!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE -#ifdef DOUBLE - vld VX1, X, 2 * SIZE - vld VX3, Y, 2 * SIZE - vpickev.d x1, VX1, VX0 - vpickod.d x2, VX1, VX0 - vpickev.d x3, VX3, VX2 - vpickod.d x4, VX3, VX2 - vfmul.d VX0, x3, VXS - vfmul.d VX1, x1, VXS - vfsub.d VX1, VXZ, VX1 - vfmul.d VX2, x4, VXS - vfmul.d VX3, x2, VXS - vfsub.d VX3, VXZ, VX3 - vilvl.d x1, VX2 ,VX0 - vilvh.d x2, VX2, VX0 - vilvl.d x3, VX3 ,VX1 - vilvh.d x4, VX3, VX1 - vst x1, X, 0 * SIZE - vst x3, Y, 0 * SIZE - vst x2, X, 2 * SIZE - vst x4, Y, 2 * SIZE - addi.d X, X, 4 * SIZE - addi.d Y, Y, 4 * SIZE -#else - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE - vpickev.w x1, VX1, VX0 - vpickod.w x2, VX1, VX0 - vpickev.w x3, VX3, VX2 - vpickod.w x4, VX3, VX2 - vfmul.s VX0, x3, VXS - vfmul.s VX1, x1, VXS - vfsub.s VX1, VXZ, VX1 - vfmul.s VX2, x4, VXS - vfmul.s VX3, x2, VXS - vfsub.s VX3, VXZ, VX3 - vilvl.w x1, VX2 ,VX0 - vilvh.w x2, VX2, VX0 - vilvl.w x3, VX3 ,VX1 - vilvh.w x4, VX3, VX1 - vst x1, X, 0 * SIZE - vst x3, Y, 0 * SIZE - vst x2, X, 4 * SIZE - vst x4, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE -#endif - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // C==0 S==0 - vst VXZ, X, 0 * SIZE - vst VXZ, Y, 0 * SIZE -#ifdef DOUBLE - vst VXZ, X, 2 * SIZE - vst VXZ, Y, 2 * SIZE - addi.d X, X, 4 * SIZE - addi.d Y, Y, 4 * SIZE -#else - vst VXZ, X, 4 * SIZE - vst VXZ, Y, 4 * SIZE - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE -#endif - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L22: -#ifdef DOUBLE - srai.d I, N, 2 -#endif - bge $r0, I, .L997 - move YY, Y - move XX, X - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L220 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L222 // C!=0 S==0 - b .L221 // C!=0 S!=0 - .align 3 - -.L220: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L224 // C==0 S==0 - b .L223 // C==0 S!=0 - .align 3 - .L221: // C!=0 S!=0 #ifdef DOUBLE ld.d t1, X, 0 * SIZE @@ -355,50 +198,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vstelm.d VX1, YY, 0, 1 vstelm.d VX3, YY, 1 * SIZE, 1 add.d YY, YY, INCY - - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - add.d X, X, INCX - ld.d t1, Y, 0 * SIZE - ld.d t2, Y, 1 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - ld.d t4, Y, 1 * SIZE - vinsgr2vr.d x3, t1, 0 - vinsgr2vr.d x4, t2, 0 - vinsgr2vr.d x3, t3, 1 - vinsgr2vr.d x4, t4, 1 - add.d Y, Y, INCY - vfmul.d VX0, x1, VXC - vfmadd.d VX0, x3, VXS, VX0 - vfmul.d VX1, x1, VXS - vfmsub.d VX1, x3, VXC, VX1 - vfmul.d VX2, x2, VXC - vfmadd.d VX2, x4, VXS, VX2 - vfmul.d VX3, x2, VXS - vfmsub.d VX3, x4, VXC, VX3 - vstelm.d VX0, XX, 0, 0 - vstelm.d VX2, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d VX0, XX, 0, 1 - vstelm.d VX2, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - vstelm.d VX1, YY, 0, 0 - vstelm.d VX3, YY, 1 * SIZE, 0 - add.d YY, YY, INCY - vstelm.d VX1, YY, 0, 1 - vstelm.d VX3, YY, 1 * SIZE, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L221 - b .L995 #else ld.w t1, X, 0 * SIZE ld.w t2, X, 1 * SIZE @@ -473,396 +272,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vstelm.w VX1, YY, 0, 3 vstelm.w VX3, YY, 1 * SIZE, 3 add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 #endif - .align 3 - -.L222: // C!=0 S==0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - ld.d t1, Y, 0 * SIZE - ld.d t2, Y, 1 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - ld.d t4, Y, 1 * SIZE - vinsgr2vr.d x3, t1, 0 - vinsgr2vr.d x4, t2, 0 - vinsgr2vr.d x3, t3, 1 - vinsgr2vr.d x4, t4, 1 - add.d Y, Y, INCY - vfmul.d VX0, x1, VXC - vfmul.d VX1, x3, VXC - vfmul.d VX2, x2, VXC - vfmul.d VX3, x4, VXC - vstelm.d VX0, XX, 0, 0 - vstelm.d VX2, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d VX0, XX, 0, 1 - vstelm.d VX2, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - vstelm.d VX1, YY, 0, 0 - vstelm.d VX3, YY, 1 * SIZE, 0 - add.d YY, YY, INCY - vstelm.d VX1, YY, 0, 1 - vstelm.d VX3, YY, 1 * SIZE, 1 - add.d YY, YY, INCY - - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - add.d X, X, INCX - ld.d t1, Y, 0 * SIZE - ld.d t2, Y, 1 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - ld.d t4, Y, 1 * SIZE - vinsgr2vr.d x3, t1, 0 - vinsgr2vr.d x4, t2, 0 - vinsgr2vr.d x3, t3, 1 - vinsgr2vr.d x4, t4, 1 - add.d Y, Y, INCY - vfmul.d VX0, x1, VXC - vfmul.d VX1, x3, VXC - vfmul.d VX2, x2, VXC - vfmul.d VX3, x4, VXC - vstelm.d VX0, XX, 0, 0 - vstelm.d VX2, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d VX0, XX, 0, 1 - vstelm.d VX2, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - vstelm.d VX1, YY, 0, 0 - vstelm.d VX3, YY, 1 * SIZE, 0 - add.d YY, YY, INCY - vstelm.d VX1, YY, 0, 1 - vstelm.d VX3, YY, 1 * SIZE, 1 - add.d YY, YY, INCY addi.d I, I, -1 - blt $r0, I, .L222 - b .L995 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.w x1, t1, 0 - vinsgr2vr.w x2, t2, 0 - vinsgr2vr.w x1, t3, 1 - vinsgr2vr.w x2, t4, 1 - ld.w t1, Y, 0 * SIZE - ld.w t2, Y, 1 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - ld.w t4, Y, 1 * SIZE - add.d Y, Y, INCY - vinsgr2vr.w x3, 
t1, 0 - vinsgr2vr.w x4, t2, 0 - vinsgr2vr.w x3, t3, 1 - vinsgr2vr.w x4, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - vinsgr2vr.w x1, t1, 2 - vinsgr2vr.w x2, t2, 2 - vinsgr2vr.w x1, t3, 3 - vinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - ld.w t1, Y, 0 * SIZE - ld.w t2, Y, 1 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - ld.w t4, Y, 1 * SIZE - vinsgr2vr.w x3, t1, 2 - vinsgr2vr.w x4, t2, 2 - vinsgr2vr.w x3, t3, 3 - vinsgr2vr.w x4, t4, 3 - add.d Y, Y, INCY - vfmul.s VX0, x1, VXC - vfmul.s VX1, x3, VXC - vfmul.s VX2, x2, VXC - vfmul.s VX3, x4, VXC - vstelm.w VX0, XX, 0, 0 - vstelm.w VX2, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.w VX0, XX, 0, 1 - vstelm.w VX2, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - vstelm.w VX0, XX, 0, 2 - vstelm.w VX2, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - vstelm.w VX0, XX, 0, 3 - vstelm.w VX2, XX, 1 * SIZE, 3 - add.d XX, XX, INCX - vstelm.w VX1, YY, 0, 0 - vstelm.w VX3, YY, 1 * SIZE, 0 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 1 - vstelm.w VX3, YY, 1 * SIZE, 1 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 2 - vstelm.w VX3, YY, 1 * SIZE, 2 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 3 - vstelm.w VX3, YY, 1 * SIZE, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L222 - b .L997 -#endif - .align 3 - -.L223: // C==0 S!=0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - ld.d t1, Y, 0 * SIZE - ld.d t2, Y, 1 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - ld.d t4, Y, 1 * SIZE - vinsgr2vr.d x3, t1, 0 - vinsgr2vr.d x4, t2, 0 - vinsgr2vr.d x3, t3, 1 - vinsgr2vr.d x4, t4, 1 - add.d Y, Y, INCY - vfmul.d VX0, x3, VXS - vfmul.d VX1, x1, VXS - vfsub.d VX1, VXZ, VX1 - vfmul.d VX2, x4, VXS - vfmul.d VX3, x2, VXS - vfsub.d VX3, VXZ, VX3 - vstelm.d VX0, XX, 0, 0 - vstelm.d VX2, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d VX0, XX, 0, 1 - vstelm.d VX2, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - vstelm.d VX1, YY, 0, 0 - vstelm.d VX3, YY, 1 * SIZE, 0 - add.d YY, YY, INCY - vstelm.d VX1, YY, 0, 1 - vstelm.d VX3, YY, 1 * SIZE, 1 - add.d YY, YY, INCY - - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - add.d X, X, INCX - ld.d t1, Y, 0 * SIZE - ld.d t2, Y, 1 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - ld.d t4, Y, 1 * SIZE - vinsgr2vr.d x3, t1, 0 - vinsgr2vr.d x4, t2, 0 - vinsgr2vr.d x3, t3, 1 - vinsgr2vr.d x4, t4, 1 - add.d Y, Y, INCY - vfmul.d VX0, x3, VXS - vfmul.d VX1, x1, VXS - vfsub.d VX1, VXZ, VX1 - vfmul.d VX2, x4, VXS - vfmul.d VX3, x2, VXS - vfsub.d VX3, VXZ, VX3 - vstelm.d VX0, XX, 0, 0 - vstelm.d VX2, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d VX0, XX, 0, 1 - vstelm.d VX2, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - vstelm.d VX1, YY, 0, 0 - vstelm.d VX3, YY, 1 * SIZE, 0 - add.d YY, YY, INCY - vstelm.d VX1, YY, 0, 1 - vstelm.d VX3, YY, 1 * SIZE, 1 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L223 - b .L995 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.w x1, t1, 0 - vinsgr2vr.w x2, t2, 0 - vinsgr2vr.w x1, t3, 1 - vinsgr2vr.w x2, t4, 1 - ld.w t1, Y, 0 * SIZE - ld.w t2, Y, 1 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - ld.w t4, Y, 1 * SIZE - 
add.d Y, Y, INCY - vinsgr2vr.w x3, t1, 0 - vinsgr2vr.w x4, t2, 0 - vinsgr2vr.w x3, t3, 1 - vinsgr2vr.w x4, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - vinsgr2vr.w x1, t1, 2 - vinsgr2vr.w x2, t2, 2 - vinsgr2vr.w x1, t3, 3 - vinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - ld.w t1, Y, 0 * SIZE - ld.w t2, Y, 1 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - ld.w t4, Y, 1 * SIZE - vinsgr2vr.w x3, t1, 2 - vinsgr2vr.w x4, t2, 2 - vinsgr2vr.w x3, t3, 3 - vinsgr2vr.w x4, t4, 3 - add.d Y, Y, INCY - vfmul.s VX0, x3, VXS - vfmul.s VX1, x1, VXS - vfsub.s VX1, VXZ, VX1 - vfmul.s VX2, x4, VXS - vfmul.s VX3, x2, VXS - vfsub.s VX3, VXZ, VX3 - vstelm.w VX0, XX, 0, 0 - vstelm.w VX2, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.w VX0, XX, 0, 1 - vstelm.w VX2, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - vstelm.w VX0, XX, 0, 2 - vstelm.w VX2, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - vstelm.w VX0, XX, 0, 3 - vstelm.w VX2, XX, 1 * SIZE, 3 - add.d XX, XX, INCX - vstelm.w VX1, YY, 0, 0 - vstelm.w VX3, YY, 1 * SIZE, 0 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 1 - vstelm.w VX3, YY, 1 * SIZE, 1 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 2 - vstelm.w VX3, YY, 1 * SIZE, 2 - add.d YY, YY, INCY - vstelm.w VX1, YY, 0, 3 - vstelm.w VX3, YY, 1 * SIZE, 3 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L223 - b .L997 -#endif - .align 3 - -.L224: // C==0 S==0 -#ifdef DOUBLE - vstelm.d VXZ, XX, 0, 0 - vstelm.d VXZ, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - vstelm.d VXZ, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - vstelm.d VXZ, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - vstelm.d VXZ, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d VXZ, YY, 0, 0 - vstelm.d VXZ, YY, 1 * SIZE, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - vstelm.d VXZ, YY, 1 * SIZE, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - vstelm.d VXZ, YY, 1 * SIZE, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - vstelm.d VXZ, YY, 1 * SIZE, 0 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - move X, XX - move Y, YY - b .L995 -#else - vstelm.w VXZ, XX, 0, 0 - vstelm.w VXZ, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 0 - vstelm.w VXZ, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 0 - vstelm.w VXZ, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 0 - vstelm.w VXZ, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.w VXZ, YY, 0, 0 - vstelm.w VXZ, YY, 1 * SIZE, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 0 - vstelm.w VXZ, YY, 1 * SIZE, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 0 - vstelm.w VXZ, YY, 1 * SIZE, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 0 - vstelm.w VXZ, YY, 1 * SIZE, 0 - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 - move X, XX - move Y, YY + blt $r0, I, .L221 b .L997 -#endif .align 3 -#ifdef DOUBLE - .L995: - andi I, N, 3 - bge $r0, I, .L999 - b .L998 - .align 3 - -#endif .L996: move I, N b .L998 diff --git a/kernel/loongarch64/rot_lsx.S b/kernel/loongarch64/rot_lsx.S index 3bb77aaec..ea1df4027 100644 --- a/kernel/loongarch64/rot_lsx.S +++ b/kernel/loongarch64/rot_lsx.S @@ -64,6 +64,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT + move XX, X + move YY, Y #ifdef DOUBLE movfr2gr.d t1, C vreplgr2vr.d VXC, t1 @@ -80,27 +82,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vreplgr2vr.w VXZ, t3 #endif srai.d I, N, 3 + bge $r0, I, .L997 bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 + bne INCY, TEMP, .L121 // INCX==1 and INCY!=1 + b .L111 // INCX==1 and INCY==1 .L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L110 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L112 // C!=0 S==0 - b .L111 // C!=0 S!=0 - .align 3 - -.L110: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L114 // C==0 S==0 - b .L113 // C==0 S!=0 - .align 3 + bne INCY, TEMP, .L221 // INCX!=1 and INCY!=1 + b .L211 // INCX!=1 and INCY==1 .L111: // C!=0 S!=0 vld VX0, X, 0 * SIZE @@ -151,129 +139,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L997 .align 3 -.L112: // C!=0 S==0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE -#ifdef DOUBLE - vld VX1, X, 2 * SIZE - vld VX3, Y, 2 * SIZE -#else - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE -#endif - VMUL VT0, VX0, VXC - VMUL VT1, VX2, VXC - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - VMUL VT0, VX1, VXC - VMUL VT1, VX3, VXC -#ifdef DOUBLE - vst VT0, X, 2 * SIZE - vst VT1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX2, Y, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX3, Y, 6 * SIZE - VMUL VT0, VX0, VXC - VMUL VT1, VX2, VXC -#endif - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE -#ifdef DOUBLE - VMUL VT0, VX1, VXC - VMUL VT1, VX3, VXC - vst VT0, X, 6 * SIZE - vst VT1, Y, 6 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // C==0 S!=0 - vld VX0, X, 0 * SIZE - vld VX2, Y, 0 * SIZE -#ifdef DOUBLE - vld VX1, X, 2 * SIZE - vld VX3, Y, 2 * SIZE -#else - vld VX1, X, 4 * SIZE - vld VX3, Y, 4 * SIZE -#endif - VMUL VT0, VX2, VXS - VMUL VT1, VX0, VXS - VFSUB VT1, VXZ, VT1 - vst VT0, X, 0 * SIZE - vst VT1, Y, 0 * SIZE - VMUL VT0, VX3, VXS - VMUL VT1, VX1, VXS - VFSUB VT1, VXZ, VT1 -#ifdef DOUBLE - vst VT0, X, 2 * SIZE - vst VT1, Y, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX2, Y, 4 * SIZE - vld VX1, X, 6 * SIZE - vld VX3, Y, 6 * SIZE - VMUL VT0, VX2, VXS - VMUL VT1, VX0, VXS - VFSUB VT1, VXZ, VT1 -#endif - vst VT0, X, 4 * SIZE - vst VT1, Y, 4 * SIZE -#ifdef DOUBLE - VMUL VT0, VX3, VXS - VMUL VT1, VX1, VXS - VFSUB VT1, VXZ, VT1 - vst VT0, X, 6 * SIZE - vst VT1, Y, 6 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // C==0 S==0 - vst VXZ, X, 0 * SIZE - vst VXZ, Y, 0 * SIZE -#ifdef DOUBLE - vst VXZ, X, 2 * SIZE - vst VXZ, Y, 2 * SIZE -#endif - vst VXZ, X, 4 * SIZE - vst VXZ, Y, 4 * SIZE -#ifdef DOUBLE - vst VXZ, X, 6 * SIZE - vst VXZ, Y, 6 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - move XX, X - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L120 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L122 // C!=0 S==0 - b .L121 // C!=0 S!=0 - .align 3 - -.L120: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L124 // C==0 S==0 - b .L123 // C==0 S!=0 - .align 3 - .L121: // C!=0 S!=0 #ifdef DOUBLE vld VX0, X, 0 * SIZE @@ -339,948 +204,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vstelm.w VT1, YY, 0, 2 add.d YY, YY, INCY vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 -#endif - add.d Y, Y, INCY - VMUL VT0, VX1, VXC - VFMADD VT0, VX3, VXS, VT0 - VMUL VT1, VX1, VXS - VMSUB VT1, VX3, VXC, VT1 - vst VT0, X, 4 * SIZE -#ifdef DOUBLE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - VMUL VT0, VX1, VXC - VFMADD VT0, VX3, VXS, VT0 - VMUL VT1, VX1, VXS - VMSUB VT1, VX3, VXC, VT1 - vst VT0, X, 6 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 -#else - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 -#endif - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // C!=0 S==0 -#ifdef DOUBLE - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE -#else - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE -#endif -#ifdef DOUBLE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 -#else - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 -#endif - add.d Y, Y, INCY - VMUL VT0, VX0, VXC - VMUL VT1, VX2, VXC - vst VT0, X, 0 * SIZE -#ifdef DOUBLE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX0, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - VMUL VT0, VX0, VXC - VMUL VT1, VX2, VXC - vst VT0, X, 2 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - VMUL VT0, VX1, VXC - VMUL VT1, VX3, VXC - vst VT0, X, 4 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - VMUL VT0, VX1, VXC - VMUL VT1, VX3, VXC - vst VT0, X, 6 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 -#else - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - VMUL VT0, VX1, VXC - VMUL VT1, VX3, VXC - vst VT0, X, 4 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 
3 -#endif - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 - -.L123: // C==0 S!=0 -#ifdef DOUBLE - vld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE -#else - vld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE -#endif -#ifdef DOUBLE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 -#else - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 -#endif - add.d Y, Y, INCY - VMUL VT0, VX2, VXS - VMUL VT1, VX0, VXS - VFSUB VT1, VXZ, VT1 - vst VT0, X, 0 * SIZE -#ifdef DOUBLE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX0, X, 2 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - VMUL VT0, VX2, VXS - VMUL VT1, VX0, VXS - VFSUB VT1, VXZ, VT1 - vst VT0, X, 2 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - VMUL VT0, VX3, VXS - VMUL VT1, VX1, VXS - VFSUB VT1, VXZ, VT1 - vst VT0, X, 4 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - vld VX1, X, 6 * SIZE - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - VMUL VT0, VX3, VXS - VMUL VT1, VX1, VXS - VFSUB VT1, VXZ, VT1 - vst VT0, X, 6 * SIZE - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 -#else - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - vld VX1, X, 4 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY - VMUL VT0, VX3, VXS - VMUL VT1, VX1, VXS - VFSUB VT1, VXZ, VT1 - vst VT0, X, 4 * SIZE - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 -#endif - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L123 - b .L997 - .align 3 - -.L124: // C==0 S==0 - vst VXZ, X, 0 * SIZE - vst VXZ, X, 4 * SIZE -#ifdef DOUBLE - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 -#else - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 -#endif - add.d YY, YY, INCY - addi.d I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L124 -#ifdef DOUBLE - move Y, YY 
-#endif - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - move XX, X - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L210 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L212 // C!=0 S==0 - b .L211 // C!=0 S!=0 - .align 3 - -.L210: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L214 // C==0 S==0 - b .L213 // C==0 S!=0 - .align 3 - -.L211: // C!=0 S!=0 -#ifdef DOUBLE - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE -#else - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE -#endif -#ifdef DOUBLE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 -#else - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 -#endif - add.d X, X, INCX - VMUL VT0, VXC, VX0 - VFMADD VT0, VX2, VXS, VT0 - VMUL VT1, VXS, VX0 - VMSUB VT1, VX2, VXC, VT1 -#ifdef DOUBLE - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX2, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - add.d X, X, INCX - VMUL VT0, VXC, VX0 - VFMADD VT0, VX2, VXS, VT0 - VMUL VT1, VXS, VX0 - VMSUB VT1, VX2, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 2 * SIZE - vld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - VMUL VT0, VX1, VXC - VFMADD VT0, VX3, VXS, VT0 - VMUL VT1, VX1, VXS - VMSUB VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - VMUL VT0, VX1, VXC - VFMADD VT0, VX3, VXS, VT0 - VMUL VT1, VX1, VXS - VMSUB VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 6 * SIZE -#else - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - VMUL VT0, VX1, VXC - VFMADD VT0, VX3, VXS, VT0 - VMUL VT1, VX1, VXS - VMSUB VT1, VX3, VXC, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE -#endif - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // C!=0 S==0 -#ifdef DOUBLE - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE -#else - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE -#endif -#ifdef DOUBLE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 -#else - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 
-#endif - add.d X, X, INCX - VMUL VT0, VXC, VX0 - VMUL VT1, VX2, VXC - -#ifdef DOUBLE - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX2, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - add.d X, X, INCX - VMUL VT0, VXC, VX0 - VMUL VT1, VX2, VXC - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 2 * SIZE - vld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - VMUL VT0, VX1, VXC - VMUL VT1, VX3, VXS - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - VMUL VT0, VX1, VXC - VMUL VT1, VX3, VXS - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - vst VT1, Y, 6 * SIZE -#else - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - VMUL VT0, VX1, VXC - VMUL VT1, VX3, VXS - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE -#endif - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // C==0 S!=0 -#ifdef DOUBLE - vld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE -#else - vld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE -#endif -#ifdef DOUBLE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 -#else - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 -#endif - add.d X, X, INCX - VMUL VT0, VXS, VX2 - VMUL VT1, VXS, VX0 - VFSUB VT1, VXZ, VT1 - -#ifdef DOUBLE - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX2, Y, 2 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - add.d X, X, INCX - VMUL VT0, VXS, VX2 - VMUL VT1, VXS, VX0 - VFSUB VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 2 * SIZE - vld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - add.d X, X, INCX - VMUL VT0, VX3, VXS - VMUL VT1, VX1, VXS - VFSUB VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - vld VX3, Y, 6 * SIZE - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - VMUL VT0, VX3, VXS - VMUL VT1, VX1, VXS - VFSUB 
VT1, VXZ, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 6 * SIZE -#else - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vld VX3, Y, 4 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX - VMUL VT0, VX3, VXS - VMUL VT1, VX1, VXS - VFSUB VT1, VXZ, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE -#endif - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // C==0 S==0 -#ifdef DOUBLE - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 -#else - vstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - vst VT1, Y, 0 * SIZE - vstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 -#endif - add.d XX, XX, INCX - vst VT1, Y, 4 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 -#ifdef DOUBLE - move X, XX -#endif - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - move XX, X - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L220 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L222 // C!=0 S==0 - b .L221 // C!=0 S!=0 - .align 3 - -.L220: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L224 // C==0 S==0 - b .L223 // C==0 S!=0 - .align 3 - -.L221: // C!=0 S!=0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX -#else - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX -#endif -#ifdef DOUBLE - vinsgr2vr.d VX0, t1, 0 - vinsgr2vr.d VX0, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - VMUL VT0, VX0, VXC - VFMADD VT0, VX2, VXS, VT0 - VMUL VT1, VX0, VXS - VMSUB VT1, VX2, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX0, t3, 0 - vinsgr2vr.d VX0, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - VMUL VT0, VX0, VXC - VFMADD VT0, VX2, VXS, VT0 - VMUL VT1, VX0, VXS - VMSUB VT1, VX2, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - 
add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - vinsgr2vr.d VX1, t1, 0 - vinsgr2vr.d VX1, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY - VMUL VT0, VX1, VXC - VFMADD VT0, VX3, VXS, VT0 - VMUL VT1, VX1, VXS - VMSUB VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - vinsgr2vr.d VX1, t3, 0 - vinsgr2vr.d VX1, t4, 1 - add.d X, X, INCX - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY - VMUL VT0, VX1, VXC - VFMADD VT0, VX3, VXS, VT0 - VMUL VT1, VX1, VXS - VMSUB VT1, VX3, VXC, VT1 - vstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY -#else - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - VMUL VT0, VX0, VXC - VFMADD VT0, VX2, VXS, VT0 - VMUL VT1, VX0, VXS - VMSUB VT1, VX2, VXC, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - vinsgr2vr.w VX1, t1, 0 - vinsgr2vr.w VX1, t2, 1 - vinsgr2vr.w VX1, t3, 2 - vinsgr2vr.w VX1, t4, 3 - add.d X, X, INCX + add.d YY, YY, INCY + vld VX1, X, 4 * SIZE ld.w t1, Y, 0 * SIZE add.d Y, Y, INCY ld.w t2, Y, 0 * SIZE @@ -1292,19 +217,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vinsgr2vr.w VX3, t2, 1 vinsgr2vr.w VX3, t3, 2 vinsgr2vr.w VX3, t4, 3 +#endif add.d Y, Y, INCY VMUL VT0, VX1, VXC VFMADD VT0, VX3, VXS, VT0 VMUL VT1, VX1, VXS VMSUB VT1, VX3, VXC, VT1 - vstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX + vst VT0, X, 4 * SIZE +#ifdef DOUBLE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY + vld VX1, X, 6 * SIZE + ld.d t3, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t4, Y, 0 * SIZE + vinsgr2vr.d VX3, t3, 0 + vinsgr2vr.d VX3, t4, 1 + add.d Y, Y, INCY + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 + vst VT0, X, 6 * SIZE + vstelm.d VT1, YY, 0, 0 + add.d YY, YY, INCY + vstelm.d VT1, YY, 0, 1 +#else vstelm.w VT1, YY, 0, 0 add.d YY, YY, INCY vstelm.w VT1, YY, 0, 1 @@ -1312,127 +252,99 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vstelm.w VT1, YY, 0, 2 add.d YY, YY, INCY vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY #endif + add.d YY, YY, INCY + addi.d X, X, 8 * SIZE addi.d I, I, -1 - blt $r0, I, .L221 + blt $r0, I, .L121 b .L997 .align 3 -.L222: // C!=0 S==0 - ld.d t1, X, 0 * SIZE +.L211: // C!=0 S!=0 +#ifdef DOUBLE + vld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE add.d X, X, INCX - ld.d t2, X, 0 * SIZE + ld.d t2, X, 0 * SIZE +#else + vld VX2, Y, 0 * SIZE + ld.w t1, X, 0 * SIZE + add.d X, X, INCX + ld.w t2, X, 0 * SIZE add.d X, X, INCX -#ifndef DOUBLE ld.w t3, X, 0 * SIZE add.d X, X, INCX ld.w t4, X, 0 * SIZE - add.d X, X, INCX #endif #ifdef DOUBLE vinsgr2vr.d VX0, t1, 0 vinsgr2vr.d VX0, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX2, t1, 0 - vinsgr2vr.d VX2, t2, 1 - add.d Y, Y, INCY - VMUL VT0, VX0, VXC - VMUL VT1, VX2, VXC +#else + vinsgr2vr.w VX0, t1, 0 + vinsgr2vr.w VX0, t2, 1 + vinsgr2vr.w VX0, t3, 2 + vinsgr2vr.w VX0, t4, 3 +#endif + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VXS, VX0 + VMSUB VT1, VX2, VXC, VT1 +#ifdef DOUBLE vstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX vstelm.d VT0, XX, 0, 1 add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY + vst VT1, Y, 0 * SIZE + vld VX2, Y, 2 * SIZE ld.d t3, X, 0 * SIZE add.d X, X, INCX ld.d t4, X, 0 * SIZE - add.d X, X, INCX vinsgr2vr.d VX0, t3, 0 vinsgr2vr.d VX0, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX2, t3, 0 - vinsgr2vr.d VX2, t4, 1 - add.d Y, Y, INCY - VMUL VT0, VX0, VXC - VMUL VT1, VX2, VXC + add.d X, X, INCX + VMUL VT0, VXC, VX0 + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VXS, VX0 + VMSUB VT1, VX2, VXC, VT1 vstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX vstelm.d VT0, XX, 0, 1 add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY + vst VT1, Y, 2 * SIZE + vld VX3, Y, 4 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE - add.d X, X, INCX vinsgr2vr.d VX1, t1, 0 vinsgr2vr.d VX1, t2, 1 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - vinsgr2vr.d VX3, t1, 0 - vinsgr2vr.d VX3, t2, 1 - add.d Y, Y, INCY + add.d X, X, INCX VMUL VT0, VX1, VXC - VMUL VT1, VX3, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 vstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX vstelm.d VT0, XX, 0, 1 add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY + vst VT1, Y, 4 * SIZE + vld VX3, Y, 6 * SIZE ld.d t3, X, 0 * SIZE add.d X, X, INCX ld.d t4, X, 0 * SIZE - add.d X, X, INCX vinsgr2vr.d VX1, t3, 0 vinsgr2vr.d VX1, t4, 1 - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - vinsgr2vr.d VX3, t3, 0 - vinsgr2vr.d VX3, t4, 1 - add.d Y, Y, INCY + add.d X, X, INCX VMUL VT0, VX1, VXC - VMUL VT1, VX3, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 vstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX vstelm.d VT0, XX, 0, 1 add.d XX, XX, INCX - vstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VT1, YY, 0, 1 + vst VT1, Y, 6 * SIZE #else - vinsgr2vr.w VX0, t1, 0 - vinsgr2vr.w VX0, t2, 1 - vinsgr2vr.w VX0, t3, 2 - vinsgr2vr.w VX0, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX2, t1, 0 - vinsgr2vr.w VX2, t2, 1 - vinsgr2vr.w VX2, t3, 2 - vinsgr2vr.w VX2, t4, 3 - add.d Y, Y, INCY - VMUL VT0, VX0, VXC - VMUL VT1, VX2, 
VXC vstelm.w VT0, XX, 0, 0 add.d XX, XX, INCX vstelm.w VT0, XX, 0, 1 @@ -1441,14 +353,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d XX, XX, INCX vstelm.w VT0, XX, 0, 3 add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY + vst VT1, Y, 0 * SIZE + vld VX3, Y, 4 * SIZE ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -1456,25 +362,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld.w t3, X, 0 * SIZE add.d X, X, INCX ld.w t4, X, 0 * SIZE - add.d X, X, INCX vinsgr2vr.w VX1, t1, 0 vinsgr2vr.w VX1, t2, 1 vinsgr2vr.w VX1, t3, 2 vinsgr2vr.w VX1, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - vinsgr2vr.w VX3, t1, 0 - vinsgr2vr.w VX3, t2, 1 - vinsgr2vr.w VX3, t3, 2 - vinsgr2vr.w VX3, t4, 3 - add.d Y, Y, INCY + add.d X, X, INCX VMUL VT0, VX1, VXC - VMUL VT1, VX3, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 vstelm.w VT0, XX, 0, 0 add.d XX, XX, INCX vstelm.w VT0, XX, 0, 1 @@ -1483,21 +379,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d XX, XX, INCX vstelm.w VT0, XX, 0, 3 add.d XX, XX, INCX - vstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VT1, YY, 0, 3 + vst VT1, Y, 4 * SIZE #endif - add.d YY, YY, INCY + addi.d Y, Y, 8 * SIZE addi.d I, I, -1 - blt $r0, I, .L222 + blt $r0, I, .L211 b .L997 .align 3 -.L223: // C==0 S!=0 +.L221: // C!=0 S!=0 #ifdef DOUBLE ld.d t1, X, 0 * SIZE add.d X, X, INCX @@ -1522,9 +412,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vinsgr2vr.d VX2, t1, 0 vinsgr2vr.d VX2, t2, 1 add.d Y, Y, INCY - VMUL VT0, VX2, VXS + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 VMUL VT1, VX0, VXS - VFSUB VT1, VXZ, VT1 + VMSUB VT1, VX2, VXC, VT1 vstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX vstelm.d VT0, XX, 0, 1 @@ -1545,9 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vinsgr2vr.d VX2, t3, 0 vinsgr2vr.d VX2, t4, 1 add.d Y, Y, INCY - VMUL VT0, VX2, VXS - VMUL VT1, VX0, VXS - VFSUB VT1, VXZ, VT1 + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 + VMUL VT1, VX0, VXS + VMSUB VT1, VX2, VXC, VT1 vstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX vstelm.d VT0, XX, 0, 1 @@ -1568,9 +460,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vinsgr2vr.d VX3, t1, 0 vinsgr2vr.d VX3, t2, 1 add.d Y, Y, INCY - VMUL VT0, VX3, VXS - VMUL VT1, VX0, VXS - VFSUB VT1, VXZ, VT1 + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 vstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX vstelm.d VT0, XX, 0, 1 @@ -1582,18 +475,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld.d t3, X, 0 * SIZE add.d X, X, INCX ld.d t4, X, 0 * SIZE - add.d X, X, INCX vinsgr2vr.d VX1, t3, 0 vinsgr2vr.d VX1, t4, 1 + add.d X, X, INCX ld.d t3, Y, 0 * SIZE add.d Y, Y, INCY ld.d t4, Y, 0 * SIZE vinsgr2vr.d VX3, t3, 0 vinsgr2vr.d VX3, t4, 1 add.d Y, Y, INCY - VMUL VT0, VX3, VXS - VMUL VT1, VX0, VXS - VFSUB VT1, VXZ, VT1 + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 vstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX vstelm.d VT0, XX, 0, 1 @@ -1601,6 +495,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vstelm.d VT1, YY, 0, 0 add.d YY, YY, INCY vstelm.d VT1, YY, 0, 1 + add.d YY, YY, INCY #else vinsgr2vr.w VX0, t1, 0 vinsgr2vr.w VX0, t2, 1 @@ -1618,9 +513,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vinsgr2vr.w VX2, t3, 2 vinsgr2vr.w VX2, t4, 3 add.d Y, Y, INCY - VMUL VT0, VX2, VXS + VMUL VT0, VX0, VXC + VFMADD VT0, VX2, VXS, VT0 VMUL VT1, VX0, VXS - VFSUB VT1, VXZ, VT1 + VMSUB VT1, VX2, VXC, VT1 vstelm.w VT0, XX, 0, 0 add.d XX, XX, INCX vstelm.w VT0, XX, 0, 1 @@ -1644,11 +540,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld.w t3, X, 0 * SIZE add.d X, X, INCX ld.w t4, X, 0 * SIZE - add.d X, X, INCX vinsgr2vr.w VX1, t1, 0 vinsgr2vr.w VX1, t2, 1 vinsgr2vr.w VX1, t3, 2 vinsgr2vr.w VX1, t4, 3 + add.d X, X, INCX ld.w t1, Y, 0 * SIZE add.d Y, Y, INCY ld.w t2, Y, 0 * SIZE @@ -1661,9 +557,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vinsgr2vr.w VX3, t3, 2 vinsgr2vr.w VX3, t4, 3 add.d Y, Y, INCY - VMUL VT0, VX3, VXS - VMUL VT1, VX0, VXS - VFSUB VT1, VXZ, VT1 + VMUL VT0, VX1, VXC + VFMADD VT0, VX3, VXS, VT0 + VMUL VT1, VX1, VXS + VMSUB VT1, VX3, VXC, VT1 vstelm.w VT0, XX, 0, 0 add.d XX, XX, INCX vstelm.w VT0, XX, 0, 1 @@ -1679,86 +576,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vstelm.w VT1, YY, 0, 2 add.d YY, YY, INCY vstelm.w VT1, YY, 0, 3 -#endif - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // C==0 S==0 -#ifdef DOUBLE - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.d VXZ, YY, 0, 1 -#else - vstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - vstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - vstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - vstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 2 add.d YY, YY, INCY - vstelm.w VXZ, YY, 0, 3 #endif - add.d YY, YY, INCY addi.d I, I, -1 - blt $r0, I, .L224 -#ifdef DOUBLE - move X, XX - move Y, YY -#endif + blt $r0, I, .L221 b .L997 .align 3 From 9e75d6b3d18e3887082aa39693d1c36142669def Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 12 Feb 2025 14:57:35 +0800 Subject: [PATCH 050/205] LoongArch64: Fixed swap_lsx.S Fixed the error when the stride is zero --- kernel/loongarch64/swap_lsx.S | 65 +++++------------------------------ 1 file changed, 9 insertions(+), 56 deletions(-) diff --git a/kernel/loongarch64/swap_lsx.S b/kernel/loongarch64/swap_lsx.S index 736187f93..6ce1a11f0 100644 --- a/kernel/loongarch64/swap_lsx.S +++ b/kernel/loongarch64/swap_lsx.S @@ -348,62 +348,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
move XX, X .L222: - LD a1, X, 0 - add.d X, X, INCX - LD a2, X, 0 - add.d X, X, INCX - LD a3, X, 0 - add.d X, X, INCX - LD a4, X, 0 - add.d X, X, INCX - LD b1, Y, 0 - ST a1, Y, 0 - add.d Y, Y, INCY - LD b2, Y, 0 - ST a2, Y, 0 - add.d Y, Y, INCY - LD b3, Y, 0 - ST a3, Y, 0 - add.d Y, Y, INCY - LD b4, Y, 0 - ST a4, Y, 0 - add.d Y, Y, INCY - LD a1, X, 0 - add.d X, X, INCX - ST b1, XX, 0 - add.d XX, XX, INCX - LD b1, Y, 0 - ST a1, Y, 0 - add.d Y, Y, INCY - LD a2, X, 0 - add.d X, X, INCX - ST b2, XX, 0 - add.d XX, XX, INCX - LD b2, Y, 0 - ST a2, Y, 0 - add.d Y, Y, INCY - LD a3, X, 0 - add.d X, X, INCX - ST b3, XX, 0 - add.d XX, XX, INCX - LD b3, Y, 0 - ST a3, Y, 0 - LD a4, X, 0 - add.d X, X, INCX - ST b4, XX, 0 - add.d XX, XX, INCX - LD b4, Y, 0 - ST a4, Y, 0 - add.d Y, Y, INCY - ST b1, XX, 0 - add.d XX, XX, INCX - ST b2, XX, 0 - add.d XX, XX, INCX - ST b3, XX, 0 - add.d XX, XX, INCX - ST b4, XX, 0 - add.d XX, XX, INCX - addi.d I, I, -1 +.rept 8 + LD $f12, X, 0 + LD $f14, Y, 0 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + add.d Y, Y, INCY +.endr + addi.d I, I, -1 blt $r0, I, .L222 .align 3 From 2c4a5cc6e6037a3ad92b0fff09bcb498a4c74fc0 Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 12 Feb 2025 14:59:39 +0800 Subject: [PATCH 051/205] LoongArch64: Fixed snrm2_lsx.S and cnrm2_lsx.S When the data type is single-precision real or single-precision complex, converting it to double precision does not prevent overflow (as exposed in LAPACK tests). The only solution is to follow C's approach: find the maximum value in the array and divide each element by that maximum to avoid this issue --- kernel/loongarch64/cnrm2_lsx.S | 79 +++++++++++++++++++++------------ kernel/loongarch64/snrm2_lsx.S | 81 +++++++++++++++++++++++----------- 2 files changed, 106 insertions(+), 54 deletions(-) diff --git a/kernel/loongarch64/cnrm2_lsx.S b/kernel/loongarch64/cnrm2_lsx.S index 20950ba17..17361d143 100644 --- a/kernel/loongarch64/cnrm2_lsx.S +++ b/kernel/loongarch64/cnrm2_lsx.S @@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VX4 $vr21 #define res1 $vr19 #define res2 $vr20 +#define RCP $f2 +#define VALPHA $vr3 PROLOGUE @@ -55,10 +57,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 bge $r0, N, .L999 beq $r0, INCX, .L999 + addi.d $sp, $sp, -32 + st.d $ra, $sp, 0 + st.d N, $sp, 8 + st.d X, $sp, 16 + st.d INCX, $sp, 24 +#ifdef DYNAMIC_ARCH + bl camax_k_LA264 +#else + bl camax_k +#endif + ld.d $ra, $sp, 0 + ld.d N, $sp, 8 + ld.d X, $sp, 16 + ld.d INCX, $sp, 24 + addi.d $sp, $sp, 32 + + frecip.s RCP, $f0 + vreplvei.w VALPHA, $vr2, 0 + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + fcmp.ceq.s $fcc0, $f0, $f19 + bcnez $fcc0, .L999 li.d TEMP, 1 slli.d TEMP, TEMP, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT @@ -69,16 +91,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.L10: vld VX0, X, 0 * SIZE - vfcvtl.d.s VX1, VX0 - vfcvth.d.s VX2, VX0 - vfmadd.d res1, VX1, VX1, res1 - vfmadd.d res2, VX2, VX2, res2 - vld VX0, X, 4 * SIZE - vfcvtl.d.s VX3, VX0 - vfcvth.d.s VX4, VX0 - vfmadd.d res1, VX3, VX3, res1 - vfmadd.d res2, VX4, VX4, res2 addi.d I, I, -1 + vld VX0, X, 0 * SIZE + vld VX1, X, 4 * SIZE + vfmul.s VX0, VX0, VALPHA + vfmul.s VX1, VX1, VALPHA + + vfmadd.s res1, VX0, VX0, res1 + vfmadd.s res2, VX1, VX1, res2 + addi.d X, X, 8 * SIZE blt $r0, I, .L10 b .L996 @@ -99,10 +120,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vinsgr2vr.w VX0, t3, 2 vinsgr2vr.w VX0, t4, 3 add.d X, X, INCX - vfcvtl.d.s VX1, VX0 - vfcvth.d.s VX2, VX0 - vfmadd.d res1, VX1, VX1, res1 - vfmadd.d res2, VX2, VX2, res2 + vfmul.s VX0, VX0, VALPHA + vfmadd.s res1, VX0, VX0, res1 + ld.w t1, X, 0 * SIZE ld.w t2, X, 1 * SIZE add.d X, X, INCX @@ -113,19 +133,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vinsgr2vr.w VX0, t3, 2 vinsgr2vr.w VX0, t4, 3 add.d X, X, INCX - vfcvtl.d.s VX3, VX0 - vfcvth.d.s VX4, VX0 - vfmadd.d res1, VX3, VX3, res1 - vfmadd.d res2, VX4, VX4, res2 + vfmul.s VX0, VX0, VALPHA + vfmadd.s res2, VX0, VX0, res2 + addi.d I, I, -1 blt $r0, I, .L21 b .L996 .align 3 .L996: - vfadd.d res1, res1, res2 - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 + vfadd.s res1, res1, res2 + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 .align 3 .L997: @@ -137,18 +160,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fld.s a1, X, 0 * SIZE fld.s a2, X, 1 * SIZE addi.d I, I, -1 - fcvt.d.s a1, a1 - fcvt.d.s a2, a2 - fmadd.d res, a1, a1, res - fmadd.d res, a2, a2, res + fmul.s a1, a1, RCP + fmul.s a2, a2, RCP + fmadd.s res, a1, a1, res + fmadd.s res, a2, a2, res add.d X, X, INCX blt $r0, I, .L998 .align 3 .L999: - fsqrt.d res, res + fsqrt.s res, res + fmul.s $f0, res, $f0 move $r4, $r17 - fcvt.s.d $f0, $f19 jirl $r0, $r1, 0x0 .align 3 diff --git a/kernel/loongarch64/snrm2_lsx.S b/kernel/loongarch64/snrm2_lsx.S index bb492dbf0..9822cc66a 100644 --- a/kernel/loongarch64/snrm2_lsx.S +++ b/kernel/loongarch64/snrm2_lsx.S @@ -52,6 +52,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* Don't change following FR unless you know the effects. */ #define res1 $vr19 #define res2 $vr20 +#define RCP $f2 +#define VALPHA $vr3 + +// The optimization for snrm2 cannot simply involve +// extending the data type from float to double and +// then summing the squares of the data. LAPACK tests +// have shown that this approach can still lead to data overflow. +// Instead, we need to find the maximum absolute value in the entire +// array and divide each data element by this maximum value before +// performing the calculation. This approach can avoid overflow (and does not require extending the data type). PROLOGUE @@ -59,10 +69,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
LDINT N, 0(N) LDINT INCX, 0(INCX) #endif - vxor.v res1, res1, res1 - vxor.v res2, res2, res2 bge $r0, N, .L999 beq $r0, INCX, .L999 + + addi.d $sp, $sp, -32 + st.d $ra, $sp, 0 + st.d N, $sp, 8 + st.d X, $sp, 16 + st.d INCX, $sp, 24 +#ifdef DYNAMIC_ARCH + bl samax_k_LA264 +#else + bl samax_k +#endif + ld.d $ra, $sp, 0 + ld.d N, $sp, 8 + ld.d X, $sp, 16 + ld.d INCX, $sp, 24 + addi.d $sp, $sp, 32 + + frecip.s RCP, $f0 + vreplvei.w VALPHA, $vr2, 0 + vxor.v res1, res1, res1 + vxor.v res2, res2, res2 + fcmp.ceq.s $fcc0, $f0, $f19 + bcnez $fcc0, .L999 li.d TEMP, SIZE slli.d INCX, INCX, BASE_SHIFT srai.d I, N, 3 @@ -75,14 +106,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vld VX5, X, 4 * SIZE addi.d I, I, -1 addi.d X, X, 8 * SIZE - vfcvtl.d.s VX1, VX0 - vfcvth.d.s VX2, VX0 - vfcvtl.d.s VX3, VX5 - vfcvth.d.s VX4, VX5 - vfmadd.d res1, VX1, VX1, res1 - vfmadd.d res2, VX2, VX2, res2 - vfmadd.d res1, VX3, VX3, res1 - vfmadd.d res2, VX4, VX4, res2 + + vfmul.s VX0, VX0, VALPHA + vfmul.s VX5, VX5, VALPHA + + vfmadd.s res1, VX0, VX0, res1 + vfmadd.s res2, VX5, VX5, res2 blt $r0, I, .L10 b .L996 .align 3 @@ -104,10 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vinsgr2vr.w VX0, t2, 1 vinsgr2vr.w VX0, t3, 2 vinsgr2vr.w VX0, t4, 3 - vfcvtl.d.s VX1, VX0 - vfcvth.d.s VX2, VX0 - vfmadd.d res1, VX1, VX1, res1 - vfmadd.d res2, VX2, VX2, res2 + vfmul.s VX0, VX0, VALPHA + vfmadd.s res1, VX0, VX0, res1 + ld.w t1, X, 0 add.d X, X, INCX ld.w t2, X, 0 @@ -120,19 +148,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vinsgr2vr.w VX0, t2, 1 vinsgr2vr.w VX0, t3, 2 vinsgr2vr.w VX0, t4, 3 - vfcvtl.d.s VX3, VX0 - vfcvth.d.s VX4, VX0 - vfmadd.d res1, VX3, VX3, res1 - vfmadd.d res2, VX4, VX4, res2 + vfmul.s VX0, VX0, VALPHA + vfmadd.s res2, VX0, VX0, res2 addi.d I, I, -1 blt $r0, I, .L21 - b .L996 .align 3 .L996: - vfadd.d res1, res1, res2 - vreplvei.d VX1, res1, 1 - vfadd.d res1, VX1, res1 + vfadd.s res1, res1, res2 + vreplvei.w VX1, res1, 1 + vreplvei.w VX2, res1, 2 + vreplvei.w VX3, res1, 3 + vfadd.s res1, VX1, res1 + vfadd.s res1, VX2, res1 + vfadd.s res1, VX3, res1 .align 3 .L997: @@ -143,16 +172,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.L998: fld.s $f15, X, 0 addi.d I, I, -1 - fcvt.d.s $f15, $f15 - fmadd.d $f19, $f15, $f15, $f19 + fmul.s $f15, $f15, RCP + fmadd.s $f19, $f15, $f15, $f19 add.d X, X, INCX blt $r0, I, .L998 .align 3 .L999: - fsqrt.d $f19, $f19 + fsqrt.s $f19, $f19 + fmul.s $f0, $f19, $f0 move $r4, $r17 - fcvt.s.d $f0, $f19 jirl $r0, $r1, 0x0 .align 3 From 98b5ef929cfc98f2f3c236966830276c255118d2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2025 09:04:22 +0100 Subject: [PATCH 052/205] Restore the non-vectorized code from before PR4880 for POWER8 --- kernel/power/sgemv_t.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index e133c815c..ed0a24230 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -78,7 +78,17 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA temp7 += v_x[i] * va7[i]; } - + #if defined(POWER8) + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + #else register __vector float t0, t1, t2, t3; register __vector float a = { alpha, alpha, alpha, alpha }; __vector float *v_y = (__vector float*) y; @@ -105,7 +115,7 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA v_y[0] += a * temp0; v_y[1] += a * temp4; - +#endif } @@ -132,7 +142,12 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA temp2 += v_x[i] * va2[i]; temp3 += v_x[i] * va3[i]; } - + #if defined(POWER8) + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + #else register __vector float t0, t1, t2, t3; register __vector float a = { alpha, alpha, alpha, alpha }; __vector float *v_y = (__vector float*) y; @@ -148,7 +163,7 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA temp0 += temp1 + temp2 + temp3; v_y[0] += a * temp0; - +#endif } From 81eed868b68c72ea1868663902f0904dc1b22326 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2025 09:07:20 +0100 Subject: [PATCH 053/205] Restore the non-vectorized code from before PR4880 for POWER8 --- kernel/power/sgemv_t_8.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c index f21f6eb7d..b30bb1137 100644 --- a/kernel/power/sgemv_t_8.c +++ b/kernel/power/sgemv_t_8.c @@ -99,7 +99,17 @@ static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA temp7 += vx1* va7_1 + vx2 * va7_2; } - + #if defined(POWER8) + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + 
temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + #else register __vector float t0, t1, t2, t3; register __vector float a = { alpha, alpha, alpha, alpha }; __vector float *v_y = (__vector float*) y; @@ -126,7 +136,7 @@ static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA v_y[0] += a * temp0; v_y[1] += a * temp4; - +#endif } @@ -153,7 +163,13 @@ static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; } - + + #if defined(POWER8) + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + #else register __vector float t0, t1, t2, t3; register __vector float a = { alpha, alpha, alpha, alpha }; __vector float *v_y = (__vector float*) y; @@ -169,7 +185,7 @@ static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA temp0 += temp1 + temp2 + temp3; v_y[0] += a * temp0; - +#endif } From 7f1f776f58398442796dd4bec1d6191a69869993 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2025 11:23:02 +0100 Subject: [PATCH 054/205] Update FreeBSD jobs to 14.2 --- .cirrus.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index a7f64255d..741e04e18 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -127,7 +127,7 @@ task: FreeBSD_task: name: FreeBSD-gcc freebsd_instance: - image_family: freebsd-14-1 + image_family: freebsd-14-2 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc compile_script: @@ -138,7 +138,7 @@ FreeBSD_task: FreeBSD_task: name: freebsd-gcc-ilp64 freebsd_instance: - image_family: freebsd-14-1 + image_family: freebsd-14-2 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc compile_script: @@ -148,7 +148,7 @@ FreeBSD_task: FreeBSD_task: name: FreeBSD-clang-openmp freebsd_instance: - image_family: freebsd-14-1 + image_family: freebsd-14-2 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc - ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so From daf16b8229b88362c4bb0dfe5a1face6e82e984a Mon Sep 17 00:00:00 2001 From: Harish-Gits Date: Wed, 12 Feb 2025 12:10:57 +0530 Subject: [PATCH 055/205] Adjusted GESV threading logic for optimal performance on WoA --- interface/lapack/gesv.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/interface/lapack/gesv.c b/interface/lapack/gesv.c index 546c2bed2..51a38de60 100644 --- a/interface/lapack/gesv.c +++ b/interface/lapack/gesv.c @@ -107,21 +107,33 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, #ifndef PPC440 buffer = (FLOAT *)blas_memory_alloc(1); - + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #endif #ifdef SMP args.common = NULL; -#ifndef DOUBLE - if (args.m*args.n < 40000) + +#if defined(_WIN64) && defined(_M_ARM64) + #ifdef COMPLEX + if (args.m * args.n > 600) + #else + if (args.m * args.n > 1000) + #endif + args.nthreads = num_cpu_avail(4); + else + args.nthreads = 1; #else - if (args.m*args.n < 10000) + #ifndef DOUBLE + if (args.m * args.n < 40000) + #else + if (args.m * args.n < 10000) + #endif + 
args.nthreads = 1; + else + args.nthreads = num_cpu_avail(4); #endif - args.nthreads=1; - else - args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif From 877d5a5be62969375f841dd51d3d9090a7245cc8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Feb 2025 17:01:06 +0100 Subject: [PATCH 056/205] Add -O2 to flang flags when building on WoA in Release mode --- cmake/system.cmake | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 9c437fc99..ee2500da1 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -628,6 +628,18 @@ set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") endif() # TODO: not sure what PFLAGS is -hpa set(PFLAGS "${PFLAGS} ${CCOMMON_OPT} -I${TOPDIR} -DPROFILE ${COMMON_PROF}") +if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") + +if ("${F_COMPILER}" STREQUAL "FLANG") +if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) + set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") +endif () +endif () +if (ARM64 AND CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Windows") + set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -O2") +endif () +endif () + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FCOMMON_OPT}") # TODO: not sure what FPFLAGS is -hpa @@ -653,7 +665,7 @@ if (CMAKE_Fortran_COMPILER) if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") - message(STATUS "removing fortran flags") + message(STATUS "removing fortran flags not supported by the compiler") set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") endif () foreach (FILTER_FLAG ${FILTER_FLAGS}) @@ -684,13 +696,6 @@ if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM" AND ${CMAKE_SYSTEM_NAME} STREQUAL set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DNOCHANGE") endif () -if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") -if ("${F_COMPILER}" STREQUAL "FLANG") -if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) - set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") -endif () -endif () -endif () if (NOT DEFINED SUFFIX) set(SUFFIX o) From d23eb3b93ec42eae92bfafffc50f1a6d1e0c0d25 Mon Sep 17 00:00:00 2001 From: Vaisakh K V Date: Thu, 5 Dec 2024 11:41:05 +0530 Subject: [PATCH 057/205] Support for SME1 based sgemm_direct kernel for cblas_sgemm level 3 API * Added ARMV9SME target * Added SGEMM_DIRECT kernel based on SME1 --- CMakeLists.txt | 3 +- Makefile.arm64 | 5 + Makefile.system | 8 + TargetList.txt | 1 + c_check | 19 ++ cmake/arch.cmake | 18 +- cmake/cc.cmake | 6 + cmake/prebuild.cmake | 2 +- cmake/system.cmake | 38 +++- cmake/system_check.cmake | 11 + common.h | 1 + common_arm64.h | 2 +- common_param.h | 6 + common_s.h | 4 +- driver/others/dynamic_arm64.c | 34 +++ getarch.c | 13 ++ interface/gemm.c | 70 ++++-- kernel/CMakeLists.txt | 16 +- kernel/Makefile | 4 + kernel/Makefile.L3 | 33 ++- kernel/arm64/KERNEL.ARMV9SME | 3 + kernel/arm64/sgemm_direct_arm64_sme1.c | 59 +++++ kernel/arm64/sgemm_direct_sme1.S | 228 ++++++++++++++++++++ kernel/arm64/sgemm_direct_sme1_preprocess.S | 133 ++++++++++++ kernel/setparam-ref.c | 5 + param.h | 8 +- 26 files changed, 694 insertions(+), 36 deletions(-) create mode 100644 kernel/arm64/KERNEL.ARMV9SME create mode 100644 kernel/arm64/sgemm_direct_arm64_sme1.c create mode 100644 
kernel/arm64/sgemm_direct_sme1.S create mode 100644 kernel/arm64/sgemm_direct_sme1_preprocess.S diff --git a/CMakeLists.txt b/CMakeLists.txt index ddff73c2c..8e99bd208 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,11 +4,12 @@ cmake_minimum_required(VERSION 3.16.0) +set (CMAKE_ASM_SOURCE_FILE_EXTENSIONS "S") project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 28.dev) +set(OpenBLAS_PATCH_VERSION 29.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") diff --git a/Makefile.arm64 b/Makefile.arm64 index fccc0d0d0..46e4baefc 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -30,6 +30,11 @@ FCOMMON_OPT += -march=armv8-a+sve endif endif +ifeq ($(CORE), ARMV9SME) +CCOMMON_OPT += -march=armv9-a+sve2+sme +FCOMMON_OPT += -march=armv9-a+sve2 +endif + ifeq ($(CORE), CORTEXA53) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 ifneq ($(F_COMPILER), NAG) diff --git a/Makefile.system b/Makefile.system index 29ea819f1..14830eb4e 100644 --- a/Makefile.system +++ b/Makefile.system @@ -420,6 +420,7 @@ ifeq ($(ARCH), arm64) export MACOSX_DEPLOYMENT_TARGET=11.0 ifeq ($(C_COMPILER), GCC) export NO_SVE = 1 +export NO_SME = 1 endif else export MACOSX_DEPLOYMENT_TARGET=10.8 @@ -709,6 +710,9 @@ DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += ARMV8SVE DYNAMIC_CORE += A64FX endif +ifneq ($(NO_SME), 1) +DYNAMIC_CORE += ARMV9SME +endif DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 @@ -1474,6 +1478,10 @@ ifeq ($(NO_SVE), 1) CCOMMON_OPT += -DNO_SVE endif +ifeq ($(NO_SME), 1) +CCOMMON_OPT += -DNO_SME +endif + ifdef SMP CCOMMON_OPT += -DSMP_SERVER diff --git a/TargetList.txt b/TargetList.txt index 25eeddfb0..232e12ffa 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -111,6 +111,7 @@ THUNDERX3T110 VORTEX A64FX ARMV8SVE +ARMV9SME FT2000 9.System Z: diff --git a/c_check b/c_check index c2b52c81b..0aea55fee 100755 --- a/c_check +++ b/c_check @@ -331,6 +331,24 @@ if [ "$architecture" = "arm64" ]; then rm -rf "$tmpd" fi +no_sme=0 +if [ "$architecture" = "arm64" ]; then + tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') + tmpf="$tmpd/a.S" + printf ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n">> "$tmpf" + args=" -march=armv9-a+sve2+sme -c -o $tmpf.o $tmpf" + no_sme=0 + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + args=" -march=armv9-a+sme -c -o $tmpf.o $tmpf" + $compiler_name $flags $args >/dev/null 2>&1 + } || { + no_sme=1 + } + rm -rf "$tmpd" +fi + c11_atomics=0 case "$data" in *HAVE_C11*) @@ -472,6 +490,7 @@ done printf "CEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a" [ "$no_msa" -eq 1 ] && printf "NO_MSA=1\n" [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n" + [ "$no_sme" -eq 1 ] && printf "NO_SME=1\n" [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n" diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 27ba6f872..ec91a2d59 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,9 +44,21 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) - if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) - set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + if 
(${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 10) # SVE ACLE supported in GCC >= 10 + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) + endif () + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14 + set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) + endif() + elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11 + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) + endif () + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19 + set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) + endif() endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 775239e1c..5e9c5a8c4 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -238,6 +238,12 @@ if (${CORE} STREQUAL ARMV8SVE) endif () endif () +if (${CORE} STREQUAL ARMV9SME) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme") + endif () +endif () + if (${CORE} STREQUAL CORTEXA510) if (NOT DYNAMIC_ARCH) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 53a78d782..f6ca73b7b 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1014,7 +1014,7 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) - elseif ("${TCORE}" STREQUAL "NEOVERSEN2") + elseif ("${TCORE}" STREQUAL "NEOVERSEN2" or "${TCORE}" STREQUAL "ARMV9SME") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" "#define L1_CODE_LINESIZE\t64\n" diff --git a/cmake/system.cmake b/cmake/system.cmake index 6b891ca0e..871fdb2e6 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -21,7 +21,15 @@ endif() # Other files expect CORE, which is actually TARGET and will become TARGET_CORE for kernel build. Confused yet? # It seems we are meant to use TARGET as input and CORE internally as kernel. if(NOT DEFINED CORE AND DEFINED TARGET) - set(CORE ${TARGET}) + if (${TARGET} STREQUAL "LOONGSON3R5") + set(CORE "LA464") + elseif (${TARGET} STREQUAL "LOONGSON2K1000") + set(CORE "LA264") + elseif (${TARGET} STREQUAL "LOONGSONGENERIC") + set(CORE "LA64_GENERIC)") + else () + set(CORE ${TARGET}) + endif() endif() # TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. 
@@ -310,6 +318,9 @@ if (${TARGET} STREQUAL NEOVERSEV1) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") endif() endif() + if (${TARGET} STREQUAL ARMV9SME) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme -O3") + endif() if (${TARGET} STREQUAL A64FX) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx") @@ -382,6 +393,8 @@ if (NEED_PIC) if (NOT NOFORTRAN) if (${F_COMPILER} STREQUAL "SUN") set(FCOMMON_OPT "${FCOMMON_OPT} -pic") + elseif (${F_COMPILER} STREQUAL "NAGFOR") + set(FCOMMON_OPT "${FCOMMON_OPT} -PIC") else () set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC") endif () @@ -640,17 +653,17 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") endif () if (CMAKE_Fortran_COMPILER) -if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") - set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") - if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") -message(STATUS "removing fortran flags") - set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") + if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") + set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") + if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") + message(STATUS "removing fortran flags") + set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") + endif () + foreach (FILTER_FLAG ${FILTER_FLAGS}) + string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) + string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) + endforeach () endif () - foreach (FILTER_FLAG ${FILTER_FLAGS}) - string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) - string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) - endforeach () -endif () endif () if ("${F_COMPILER}" STREQUAL "GFORTRAN") @@ -670,6 +683,9 @@ endif () if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") endif () +if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DNOCHANGE") +endif () if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") if ("${F_COMPILER}" STREQUAL "FLANG") diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 59a135878..256ab336b 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -135,6 +135,17 @@ endif() endif() endif() +if (ARM64) +if (NOT NO_SME) + file(WRITE ${PROJECT_BINARY_DIR}/sme.c ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n") + execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv9-a+sve2+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o ${PROJECT_BINARY_DIR}/sme.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME) +if (NO_SME EQUAL 1) +set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_SME") +endif() + file(REMOVE "${PROJECT_BINARY_DIR}/sme.c" "${PROJECT_BINARY_DIR}/sme.o") +endif() +endif() + include(CheckIncludeFile) CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) if (HAVE_C11 EQUAL 1) diff --git a/common.h b/common.h index b8bac1ad2..766b89cf7 100644 --- a/common.h +++ b/common.h @@ -696,6 +696,7 @@ void gotoblas_profile_init(void); void gotoblas_profile_quit(void); int support_avx512(void); +int support_sme1(void); #ifdef USE_OPENMP diff --git a/common_arm64.h b/common_arm64.h index 595a01995..5856898a2 100644 --- a/common_arm64.h +++ 
b/common_arm64.h @@ -175,7 +175,7 @@ REALNAME: #define HUGE_PAGESIZE ( 4 << 20) #ifndef BUFFERSIZE -#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) +#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) || defined(ARMV9SME) #define BUFFER_SIZE (32 << 22) #else #define BUFFER_SIZE (32 << 20) diff --git a/common_param.h b/common_param.h index c082d248e..e1a87f969 100644 --- a/common_param.h +++ b/common_param.h @@ -221,6 +221,12 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); #endif +#ifdef ARCH_ARM64 +#ifdef HAVE_SME + void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); +#endif +#endif + int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); diff --git a/common_s.h b/common_s.h index fdd80b62f..af9d940ae 100644 --- a/common_s.h +++ b/common_s.h @@ -213,9 +213,9 @@ #ifdef ARCH_X86_64 #define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant #define SGEMM_DIRECT gotoblas -> sgemm_direct -#else +#elif ARCH_ARM64 #define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant -#define SGEMM_DIRECT sgemm_direct +#define SGEMM_DIRECT gotoblas -> sgemm_direct #endif #define SGEMM_ONCOPY gotoblas -> sgemm_oncopy diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index dc88d816f..3d2bed4af 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -115,6 +115,11 @@ extern gotoblas_t gotoblas_ARMV8SVE; #else #define gotoblas_ARMV8SVE gotoblas_ARMV8 #endif +#ifdef DYN_ARMV9SME +extern gotoblas_t gotoblas_ARMV9SME; +#else +#define gotoblas_ARMV9SME gotoblas_ARMV8 +#endif #ifdef DYN_CORTEX_A55 extern gotoblas_t gotoblas_CORTEXA55; #else @@ -148,6 +153,13 @@ extern gotoblas_t gotoblas_A64FX; #define gotoblas_ARMV8SVE gotoblas_ARMV8 #define gotoblas_A64FX gotoblas_ARMV8 #endif + +#ifndef NO_SME +extern gotoblas_t gotoblas_ARMV9SME; +#else +#define gotoblas_ARMV9SME gotoblas_ARMV8SVE +#endif + extern gotoblas_t gotoblas_THUNDERX3T110; #endif #define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 @@ -168,6 +180,9 @@ extern void openblas_warning(int verbose, const char * msg); #ifndef HWCAP_SVE #define HWCAP_SVE (1 << 22) #endif +#ifndef HWCAP2_SME +#define HWCAP2_SME 1<<23 +#endif #define get_cpu_ftr(id, var) ({ \ __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ @@ -393,6 +408,13 @@ static gotoblas_t *get_coretype(void) { snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); openblas_warning(1, coremsg); } + +#if !defined(NO_SME) && defined(HWCAP2_SME) + if ((getauxval(AT_HWCAP2) & HWCAP2_SME)) { + return &gotoblas_ARMV9SME; + } +#endif + #ifndef NO_SVE if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { return &gotoblas_ARMV8SVE; @@ -443,3 +465,15 @@ void gotoblas_dynamic_init(void) { void gotoblas_dynamic_quit(void) { gotoblas = NULL; } + +int support_sme1(void) { + int ret = 0; + +#if (defined OS_LINUX || defined OS_ANDROID) + ret = getauxval(AT_HWCAP2) & HWCAP2_SME; + if(getauxval(AT_HWCAP2) & HWCAP2_SME){ + ret = 1; + } +#endif + return ret; +} diff --git a/getarch.c b/getarch.c index 826dd1ce0..b51c3ed64 100644 --- 
a/getarch.c +++ b/getarch.c @@ -1289,6 +1289,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "ARMV8SVE" #endif +#ifdef FORCE_ARMV9SME +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "ARMV9SME" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DARMV9SME " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DHAVE_SME -DARMV8 -DARMV9" +#define LIBNAME "armv9sme" +#define CORENAME "ARMV9SME" +#endif #ifdef FORCE_ARMV8 #define FORCE diff --git a/interface/gemm.c b/interface/gemm.c index 576e94593..2cd7d7b5c 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -1,5 +1,5 @@ /*********************************************************************/ -/* Copyright 2024 The OpenBLAS Project */ +/* Copyright 2024, 2025 The OpenBLAS Project */ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ @@ -86,7 +86,7 @@ #endif static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, BLASLONG) = { -#ifndef GEMM3M +#if !defined(GEMM3M) || defined(GENERIC) GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN, GEMM_NT, GEMM_TT, GEMM_RT, GEMM_CT, GEMM_NR, GEMM_TR, GEMM_RR, GEMM_CR, @@ -177,6 +177,49 @@ static int init_amxtile_permission() { } #endif +#ifdef DYNAMIC_ARCH +extern char* gotoblas_corename(void); +#endif + +#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) +static inline int get_gemm_optimal_nthreads_neoversev1(double MNK, int ncpu) { + return + MNK < 262144L ? 1 + : MNK < 1124864L ? MIN(ncpu, 6) + : MNK < 7880599L ? MIN(ncpu, 12) + : MNK < 17173512L ? MIN(ncpu, 16) + : MNK < 33386248L ? MIN(ncpu, 20) + : MNK < 57066625L ? MIN(ncpu, 24) + : MNK < 91733851L ? MIN(ncpu, 32) + : MNK < 265847707L ? MIN(ncpu, 40) + : MNK < 458314011L ? MIN(ncpu, 48) + : MNK < 729000000L ? 
MIN(ncpu, 56) + : ncpu; +} +#endif + +static inline int get_gemm_optimal_nthreads(double MNK) { + int ncpu = num_cpu_avail(3); +#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) + return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); +#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) + if (strcmp(gotoblas_corename(), "neoversev1") == 0) { + return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); + } +#endif + if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) { + return 1; + } + else { + if (MNK/ncpu < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) { + return MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); + } + else { + return ncpu; + } + } +} + #ifndef CBLAS void NAME(char *TRANSA, char *TRANSB, @@ -310,7 +353,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FLOAT *beta = (FLOAT*) vbeta; FLOAT *a = (FLOAT*) va; FLOAT *b = (FLOAT*) vb; - FLOAT *c = (FLOAT*) vc; + FLOAT *c = (FLOAT*) vc; #endif blas_arg_t args; @@ -350,14 +393,21 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) -#ifdef DYNAMIC_ARCH +#if defined(DYNAMIC_ARCH) && defined(ARCH_x86) if (support_avx512() ) -#endif if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); return; } - +#endif +#if defined(DYNAMIC_ARCH) && defined(ARCH_ARM64) + if (support_sme1()){ + if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) { + SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); + return; + } + } +#endif #endif #ifndef COMPLEX @@ -604,13 +654,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif MNK = (double) args.m * (double) args.n * (double) args.k; - if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) - args.nthreads = 1; - else { - args.nthreads = num_cpu_avail(3); - if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD) - args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD); - } + args.nthreads = get_gemm_optimal_nthreads(MNK); args.common = NULL; diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 74e6760c2..ad7dca9a7 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTMKERNEL}" "" "rotm_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) @@ -125,6 +126,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) 
GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SROTMKERNEL}" "" "rotm_k" false "" "" false "SINGLE") endif () if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") @@ -148,6 +150,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") @@ -204,19 +207,27 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) set(USE_TRMM true) endif () - set(USE_DIRECT_SGEMM false) - if (X86_64) + if (X86_64 OR (ARM64 AND (UC_TARGET_CORE MATCHES ARMV9SME))) set(USE_DIRECT_SGEMM true) endif() if (USE_DIRECT_SGEMM) # if (NOT DEFINED SGEMMDIRECTKERNEL) + if (X86_64) set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c) set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c) # endif() GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) + elseif (ARM64) + set (SGEMMDIRECTKERNEL sgemm_direct_arm64_sme1.c) + set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S) + set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S) + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE) + endif () endif() foreach (float_type SINGLE DOUBLE) @@ -1105,6 +1116,7 @@ endif () GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") diff --git a/kernel/Makefile b/kernel/Makefile index 3f9afd3fa..84cd482a0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -24,7 +24,11 @@ ifdef NO_AVX2 AVX2OPT= endif + ifdef TARGET_CORE +ifeq ($(TARGET_CORE), ARMV9SME) + override CFLAGS += -DBUILD_KERNEL 
-DTABLE_NAME=gotoblas_$(TARGET_CORE) -DHAVE_SME -march=armv9-a+sve2+sme +endif ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index ed1c74ecf..41f16f9c9 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -24,6 +24,7 @@ endif ifeq ($(ARCH), arm64) USE_TRMM = 1 +USE_DIRECT_SGEMM = 1 endif ifeq ($(ARCH), riscv64) @@ -95,9 +96,17 @@ endif ifdef USE_DIRECT_SGEMM ifndef SGEMMDIRECTKERNEL +ifeq ($(ARCH), x86_64) SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c endif +ifeq ($(ARCH), arm64) +ifeq ($(TARGET_CORE), ARMV9SME) +HAVE_SME = 1 +SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c +endif +endif +endif endif ifeq ($(BUILD_BFLOAT16), 1) @@ -128,9 +137,19 @@ SKERNELOBJS += \ $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) ifdef USE_DIRECT_SGEMM +ifeq ($(ARCH), x86_64) +SKERNELOBJS += \ + sgemm_direct$(TSUFFIX).$(SUFFIX) \ + sgemm_direct_performant$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(ARCH), arm64) +ifdef HAVE_SME SKERNELOBJS += \ sgemm_direct$(TSUFFIX).$(SUFFIX) \ - sgemm_direct_performant$(TSUFFIX).$(SUFFIX) + sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) \ + sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) +endif +endif endif endif @@ -809,11 +828,23 @@ else endif ifdef USE_DIRECT_SGEMM +ifeq ($(ARCH), x86_64) $(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif +ifeq ($(ARCH), arm64) +ifdef HAVE_SME +$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +$(KDIR)sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) : + $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1.S -UDOUBLE -UCOMPLEX -o $@ +$(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) : + $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1_preprocess.S -UDOUBLE -UCOMPLEX -o $@ +endif +endif +endif ifeq ($(BUILD_BFLOAT16), 1) diff --git a/kernel/arm64/KERNEL.ARMV9SME b/kernel/arm64/KERNEL.ARMV9SME new file mode 100644 index 000000000..dc333d829 --- /dev/null +++ b/kernel/arm64/KERNEL.ARMV9SME @@ -0,0 +1,3 @@ +include $(KERNELDIR)/KERNEL.ARMV8SVE + + diff --git a/kernel/arm64/sgemm_direct_arm64_sme1.c b/kernel/arm64/sgemm_direct_arm64_sme1.c new file mode 100644 index 000000000..bd7e54889 --- /dev/null +++ b/kernel/arm64/sgemm_direct_arm64_sme1.c @@ -0,0 +1,59 @@ +/* + Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause-Clear +*/ + +#include "common.h" +#include +#include +#include + +#if defined(HAVE_SME) + +/* Function prototypes */ +extern void sgemm_direct_sme1_preprocess(uint64_t nbr, uint64_t nbc,\ + const float * restrict a, float * a_mod) __asm__("sgemm_direct_sme1_preprocess"); +extern void sgemm_direct_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n,\ + const float * matLeft,\ + const float * restrict matRight,\ + const float * restrict matResult) __asm__("sgemm_direct_sme1_2VLx2VL"); + +/* Function Definitions */ +uint64_t sve_cntw() { + uint64_t cnt; + asm volatile( + "rdsvl %[res], #1\n" + "lsr %[res], %[res], #2\n" + : [res] "=r" (cnt) :: + ); + return cnt; +} + +/*void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K,\ + float * __restrict A, BLASLONG strideA, float * __restrict B,\ + BLASLONG strideB , float * __restrict R, BLASLONG strideR) +*/ +void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ + BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ + float * __restrict R, BLASLONG strideR){ + + uint64_t m_mod, vl_elms; + + vl_elms = sve_cntw(); + + m_mod = ceil((double)M/(double)vl_elms) * vl_elms; + + float *A_mod = (float *) malloc(m_mod*K*sizeof(float)); + + /* Pre-process the left matrix to make it suitable for + matrix sum of outer-product calculation + */ + sgemm_direct_sme1_preprocess(M, K, A, A_mod); + + /* Calculate C = A*B */ + sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R); + + free(A_mod); +} + +#endif diff --git a/kernel/arm64/sgemm_direct_sme1.S b/kernel/arm64/sgemm_direct_sme1.S new file mode 100644 index 000000000..8c0a173f3 --- /dev/null +++ b/kernel/arm64/sgemm_direct_sme1.S @@ -0,0 +1,228 @@ +/* + Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. + SPDX-License-Identifier: BSD-3-Clause-Clear +*/ + +/*-------------------------------------------------------------------------- + * SME1 based Matrix multiplication code for FP32 input matrices to FP32 + * output matrix + * C = A*B + * A: Left input matrix of dimension M x K + * B: Right input matrix of dimension K x N + * C: Result matrix of dimension M x N + * + * Usage of function: + * sgemm_direct_sme1_2VLx2VL( uint64_t M , uint64_t K, uint64_t N,\ + const float * restrict A_base,\ + const float * restrict B_base,\ + const float * restrict C_base); +----------------------------------------------------------------------------*/ + +#define M x0 //M dimension +#define K x1 //K dimension +#define N x2 //N dimension +#define A_base x3 //Pointer to left matrix(A) +#define B_base x4 //Pointer to right matrix(B) +#define C_base x5 //Pointer to result matrix(C) +#define Aptr x6 //Pointer to traverse A +#define Aptr_end x7 //Pointer to end of row of A +#define Cptr x8 //Pointer to traverse C +#define Cptr0 x9 //2nd Pointer to traverse C +#define Cptr1 x10 //3rd Pointer to traverse C +#define Bptr x11 //Pointer to traverse B +#define Bptr0 x12 //2nd Pointer to traverse B +#define N_exit x14 //Exit condition for N loop +#define K_exit x15 //Exit condition for K loop +#define M_cntr x16 //M loop counter +#define C1 x17 //Constant1: N*(SVLs+1);SVLs-No. of 32-bit elements +#define C2 x18 //Constant2: N + SVLs +#define C3 x19 //Constant3: K*SVLs + SVLs +#define C4 x20 //Constant4: SVLs-2 +#define C5 x21 //Constant5: K*SVLs +#define C6 x22 //Constant6: N*SVLs + + .text + .global sgemm_direct_sme1_2VLx2VL + + sgemm_direct_sme1_2VLx2VL: + + stp x19, x20, [sp, #-48]! 
+ stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + + smstart + + cntw C4 //SVLs + mul C5, C4, K //K*SVLs + mul C6, C4, N //N*SVLs + add C1, C6, N //N*SVLs + N + add N_exit, B_base, N, lsl #2 //N_Loop exit conditon + mov M_cntr, #0 + add C2, N, C4 //N + SVLs + add C3, C5, C4 //K*SVLs + SVLs + whilelt p2.s, M_cntr, M //Tile 0,1 predicate (M dimension) + sub w20, w20, #2 //SVLs-2 + +.M_Loop: + incw M_cntr + whilelt p3.s, M_cntr, M //Tile 2,3 predicate (M dimension) + mov Bptr, B_base //B_base + mov Cptr, C_base //C_base + whilelt p0.b, Bptr, N_exit //Tile 0/2 predicate (N dimension) + +.N_Loop: + mov Aptr, A_base //Aptr = A_base + mov Bptr0, Bptr //Bptr = B_base + mov Cptr0, Cptr //Cptr0 = C_base + addvl Cptr1, Cptr, #1 //Cptr1 = C_base + SVLb + addvl Bptr, Bptr, #1 + whilelt p1.b, Bptr, N_exit //Tile 1,3 predicate (N dimension) + add Aptr_end, A_base, C5, lsl #2 //A_base + K*SVLs + addvl K_exit, Aptr_end, #-1 //Exit condition for K loop + //Load 1st vector from Aptr + ld1w {z1.s}, p2/z, [Aptr] + zero {za} + // Load 1st vector from Bptr + ld1w {z2.s}, p0/z, [Bptr0] + // ZA0 += 1st Aptr vector OP 1st Bptr vector + fmopa za0.s, p2/m, p0/m, z1.s, z2.s + // Load 2nd vector from Aptr + ld1w {z5.s}, p3/z, [Aptr, C5, lsl #2] + // Aptr += SVLb + addvl Aptr, Aptr, #1 + +.K_Loop: + // ZA2 += 2nd Aptr vector OP 1st Bptr vector + fmopa za2.s, p3/m, p0/m, z5.s, z2.s + // Load 2nd vector from Bptr + ld1w {z3.s}, p1/z, [Bptr0, #1, MUL VL] + // ZA1 += 1st Aptr vector OP 2nd Bptr vector + fmopa za1.s, p2/m, p1/m, z1.s, z3.s + // Load next 1st vector from Aptr + ld1w {z0.s}, p2/z, [Aptr] + // ZA3 += 2nd Aptr vector OP 2nd Bptr vector + fmopa za3.s, p3/m, p1/m, z5.s, z3.s + cmp K, #2 + b.le process_K_less_than_equal_2 + // Load next 1st vector from Bptr + ld1w {z6.s}, p0/z, [Bptr0, N, lsl #2] + // ZA0 += 1st Aptr vector OP 1st Bptr vector + fmopa za0.s, p2/m, p0/m, z0.s, z6.s + // Load next 2nd vector from Aptr + ld1w {z4.s}, p3/z, [Aptr, C5, lsl #2] + // ZA2 += 2nd Aptr vector OP 1st Bptr vector + fmopa za2.s, p3/m, p0/m, z4.s, z6.s + // Load next 2nd vector from Bptr + ld1w {z7.s}, p1/z, [Bptr0, C2, lsl #2] + // Bptr += 2*ldb FP32 elms [Bytes] + add Bptr0, Bptr0, N, lsl #3 + // ZA1 += 1st Aptr vector OP 2nd Bptr vector + fmopa za1.s, p2/m, p1/m, z0.s, z7.s + // Load next 2nd vector from Aptr + ld1w {z1.s}, p2/z, [Aptr, #1, MUL VL] + // ZA3 += 2nd Aptr vector OP 2nd Bptr vector + fmopa za3.s, p3/m, p1/m, z4.s, z7.s + // Load next 1st vector from Bptr + ld1w {z2.s}, p0/z, [Bptr0] + // ZA0 += 1st Aptr vector OP 1st Bptr vector + fmopa za0.s, p2/m, p0/m, z1.s, z2.s + // Load next 2nd vector from Aptr + ld1w {z5.s}, p3/z, [Aptr, C3, lsl #2] + // Aptr += 2*SVLb [Bytes] + addvl Aptr, Aptr, #2 + cmp Aptr, K_exit + b.mi .K_Loop + // ZA2 += 2nd Aptr vector OP 1st Bptr vector + fmopa za2.s, p3/m, p0/m, z5.s, z2.s + // Load next 2nd vector from Bptr + ld1w {z3.s}, p1/z, [Bptr0, #1, MUL VL] + // ZA1 += 1st Aptr vector OP 2nd Bptr vector + fmopa za1.s, p2/m, p1/m, z1.s, z3.s + // ZA3 += 2nd Aptr vector OP 2nd Bptr vector + fmopa za3.s, p3/m, p1/m, z5.s, z3.s + +process_K_less_than_equal_2: + // Bptr += 2*ldb FP32 elements + add Bptr0, Bptr0, N, lsl #2 + cmp Aptr, Aptr_end + b.pl .Ktail_end + +.Ktail_start: + ld1w {z1.s}, p2/z, [Aptr] + ld1w {z2.s}, p0/z, [Bptr0] + ld1w {z3.s}, p1/z, [Bptr0, #1, MUL VL] + fmopa za0.s, p2/m, p0/m, z1.s, z2.s + ld1w {z5.s}, p3/z, [Aptr, C5, lsl #2] + fmopa za2.s, p3/m, p0/m, z5.s, z2.s + fmopa za1.s, p2/m, p1/m, z1.s, z3.s + fmopa za3.s, p3/m, p1/m, z5.s, z3.s + +.Ktail_end: + mov w13, #0 
+ psel p4, p0, p2.s[w13, 0] + psel p5, p1, p2.s[w13, 0] + psel p6, p0, p3.s[w13, 0] + psel p7, p1, p3.s[w13, 0] + // Store to Cptr0 + st1w {za0h.s[w13, #0]}, p4, [Cptr0] + // Store to Cptr1 + st1w {za1h.s[w13, #0]}, p5, [Cptr1] + // Store to Cptr0 + N*SVLs + st1w {za2h.s[w13, #0]}, p6, [Cptr0, C6, lsl #2] + // Store to Cptr1 + N*SVLs + st1w {za3h.s[w13, #0]}, p7, [Cptr1, C6, lsl #2] + +.Loop_store_ZA: + psel p4, p0, p2.s[w13, 1] + psel p5, p1, p2.s[w13, 1] + psel p6, p0, p3.s[w13, 1] + psel p7, p1, p3.s[w13, 1] + // Store to Cptr0 + N + st1w {za0h.s[w13, #1]}, p4, [Cptr0, N, lsl #2] + // Store to Cptr1 + N + st1w {za1h.s[w13, #1]}, p5, [Cptr1, N, lsl #2] + // Store to Cptr0 + N*(SVLs+1) + st1w {za2h.s[w13, #1]}, p6, [Cptr0, C1, lsl #2] + // Store to Cptr1 + N*(SVLs+1) + st1w {za3h.s[w13, #1]}, p7, [Cptr1, C1, lsl #2] + + add Cptr0, Cptr0, N, lsl #3 //Cptr0 += 2*N FP32 elements + add Cptr1, Cptr1, N, lsl #3 //Cptr1 += 2*N FP32 elements + add w13, w13, #2 + + psel p4, p0, p2.s[w13, 0] + psel p5, p1, p2.s[w13, 0] + psel p6, p0, p3.s[w13, 0] + psel p7, p1, p3.s[w13, 0] + st1w {za0h.s[w13, #0]}, p4, [Cptr0] + st1w {za1h.s[w13, #0]}, p5, [Cptr1] + st1w {za2h.s[w13, #0]}, p6, [Cptr0, C6, lsl #2] + st1w {za3h.s[w13, #0]}, p7, [Cptr1, C6, lsl #2] + cmp w13, w20 + b.mi .Loop_store_ZA + psel p4, p0, p2.s[w13, 1] + psel p5, p1, p2.s[w13, 1] + psel p6, p0, p3.s[w13, 1] + psel p7, p1, p3.s[w13, 1] + st1w {za0h.s[w13, #1]}, p4, [Cptr0, N, lsl #2] + st1w {za1h.s[w13, #1]}, p5, [Cptr1, N, lsl #2] + st1w {za2h.s[w13, #1]}, p6, [Cptr0, C1, lsl #2] + st1w {za3h.s[w13, #1]}, p7, [Cptr1, C1, lsl #2] + addvl Cptr, Cptr, #2 + addvl Bptr, Bptr, #1 + whilelt p0.b, Bptr, N_exit //1st Tile predicate (N dimension) + b.first .N_Loop + add A_base, A_base, C5, lsl #3 //A_base += 2*K*SVLs FP32 elements + add C_base, C_base, C6, lsl #3 //C_base += 2*N*SVLs FP32 elements + incw M_cntr + whilelt p2.s, M_cntr, M //1st Tile predicate (M dimension) + b.first .M_Loop + + smstop + + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #48 + + ret + diff --git a/kernel/arm64/sgemm_direct_sme1_preprocess.S b/kernel/arm64/sgemm_direct_sme1_preprocess.S new file mode 100644 index 000000000..fa1362075 --- /dev/null +++ b/kernel/arm64/sgemm_direct_sme1_preprocess.S @@ -0,0 +1,133 @@ +/* + Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. + SPDX-License-Identifier: BSD-3-Clause-Clear +*/ + +/*---------------------------------------------------------------------------- + * This function is used to re-arrange the elements of input matrix to + * make it suitable for matrix outer product computation using SME for matrix + * multiplication. It should be used to pre-process the leftmatrix(A) in the + * matrix muliplication (C= A*B) using sgemm_direct_sme1_2VLx2VL() + * + * The pre-processing transposes a block of SVLs rows of the input matrix and + * stores it contiguously. The same is applied to remaining blocks of SVLs + * rows. The last block of SVLs rows is zero-padded to SVLs rows if needed. 
+ * + * Usage of function: + * sgemm_direct_sme1_preprocess(uint64_t nrow, uint64_t ncol, \ + * const float * restrict mat, float * mat_mod); + * + ----------------------------------------------------------------------------*/ + + +#define nrow x0 //Number of rows of input matrix +#define ncol x1 //Number of coulumns of input matrix +#define mat x2 //Input matrix base address +#define mat_mod x3 //Output matrix (re-arranged matrix) base address +#define mat_mod_ptr x4 //Pointer to output matrix +#define mat_ptr0 x5 //Pointer to input matrix +#define mat_ptr1 x6 //2nd pointer to input matrix +#define outer_loop_cntr x7 //Outer loop counter +#define inner_loop_exit x8 //Inner loop exit condition +#define C1 x9 //Constant1: SVLs - No. of 32-bit elements +#define C2 x10 //Constant2: 3*SVLs +#define C3 x11 //Constant3: ncol*SVLs +#define C4 x13 //Constant4: 2*SVLs +#define C5 x14 //Constant5: 2*ncol +#define C6 x15 //Constant6: 3*ncol + + .text + .global sgemm_direct_sme1_preprocess + + sgemm_direct_sme1_preprocess: + + stp x19, x20, [sp, #-48]! + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + + smstart + + cntw C1 //SVLs + mul C3, C1, ncol //SVLs*ncol + lsl C5, ncol, #1 //2*ncol + add C6, C5, ncol //3*ncol + cnth C4 //2*SVLs + add C2, C1, C1, lsl #1 //3*SVLs + + mov outer_loop_cntr, #0 + //Tile predicate (M dimension) + whilelt p0.s, outer_loop_cntr, nrow + //Predicate for stores + ptrue p9.s + +.M_Loop: + mov mat_ptr0, mat //Load base address of mat + mov mat_mod_ptr, mat_mod //a_mod store base address + add inner_loop_exit, mat, ncol, lsl #2 //Exit condition for inner loop + whilelt p8.b, mat_ptr0, inner_loop_exit //Tile predicate (K dimension) + +.Loop_process: + mov mat_ptr1, mat_ptr0 + //Load_to_tile loop counter + mov w12, #0 + +.Load_to_tile: + psel p2, p8, p0.s[w12, 0] + psel p3, p8, p0.s[w12, 1] + psel p4, p8, p0.s[w12, 2] + psel p5, p8, p0.s[w12, 3] + //Load 1st row from mat_ptr1 + ld1w {za0h.s[w12, #0]}, p2/z, [mat_ptr1] + //Load 2nd row from mat_ptr1 + ncol + ld1w {za0h.s[w12, #1]}, p3/z, [mat_ptr1, ncol, lsl #2] + //Load 3rd row from mat_ptr1 + 2*ncol + ld1w {za0h.s[w12, #2]}, p4/z, [mat_ptr1, C5, lsl #2] + //Load 4th row from mat_ptr1 + 3*ncol + ld1w {za0h.s[w12, #3]}, p5/z, [mat_ptr1, C6, lsl #2] + //mat_ptr1+=4*ncol FP32 elements + add mat_ptr1, mat_ptr1, ncol, lsl #4 + //Increment counter + add w12, w12, #4 + cmp w12, w9 + b.mi .Load_to_tile + // Store_from_tile loop counter + mov w12, #0 + +.Store_from_tile: + psel p2, p9, p8.s[w12, 0] + psel p3, p9, p8.s[w12, 1] + psel p4, p9, p8.s[w12, 2] + psel p5, p9, p8.s[w12, 3] + //Store 1st col to mat_mod + st1w {za0v.s[w12, #0]}, p2, [mat_mod_ptr] + //Store 2nd col to mat_mod + SVLs + st1w {za0v.s[w12, #1]}, p3, [mat_mod_ptr, C1, lsl #2] + //Store 3rd col to mat_mod + 2*SVLs + st1w {za0v.s[w12, #2]}, p4, [mat_mod_ptr, C4, lsl #2] + //Store 4th col to mat_mod + 3*SVLs + st1w {za0v.s[w12, #3]}, p5, [mat_mod_ptr, C2, lsl #2] + + addvl mat_mod_ptr, mat_mod_ptr, #4 //mat_mod_ptr += 4*SVLb + add w12, w12, #4 //Increment counter + cmp w12, w9 + b.mi .Store_from_tile + + addvl mat_ptr0, mat_ptr0, #1 //mat_ptr0 += SVLb + whilelt p8.b, mat_ptr0, inner_loop_exit + b.first .Loop_process + + add mat_mod, mat_mod, C3, lsl #2 //mat_mod+=SVLs*nbc FP32 elements + add mat, mat, C3, lsl #2 //mat+=SVLs*nbc FP32 elements + incw outer_loop_cntr + + whilelt p0.s, outer_loop_cntr, nrow + b.first .M_Loop + + smstop + + ldp x23, x24, [sp, #32] + ldp x21, x22, [sp, #16] + ldp x19, x20, [sp], #48 + + ret + diff --git a/kernel/setparam-ref.c 
b/kernel/setparam-ref.c index fa61a209e..dece71a66 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -178,6 +178,11 @@ gotoblas_t TABLE_NAME = { #ifdef ARCH_X86_64 sgemm_directTS, sgemm_direct_performantTS, +#endif +#ifdef ARCH_ARM64 +#ifdef HAVE_SME + sgemm_directTS, +#endif #endif sgemm_kernelTS, sgemm_betaTS, diff --git a/param.h b/param.h index fee9195d0..51ebcbabb 100644 --- a/param.h +++ b/param.h @@ -3303,6 +3303,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 + + #ifdef _WIN64 /* Use explicit casting for win64 as LLP64 datamodel is used */ #define GEMM_DEFAULT_ALIGN (BLASULONG)0x03fffUL @@ -3667,7 +3669,7 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE +#elif defined(ARMV8SVE) || defined(ARMV9SME) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 @@ -3738,6 +3740,10 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #endif /* ARMv8 */ +#if defined(ARMV9SME) /* ARMv9 SME */ +#define USE_SGEMM_KERNEL_DIRECT 1 +#endif /* ARMv9 SME */ + #if defined(ARMV5) #define SNUMOPT 2 #define DNUMOPT 2 From 77c638db67dcddd556e5d8e65feef2277abbe6f5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Feb 2025 20:37:48 +0100 Subject: [PATCH 058/205] Revert "Fix potential inaccuracy in multithreaded level3 related to SWITCH_RATIO" --- driver/level3/level3_thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index a37292e8e..9b1aadf7d 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -742,7 +742,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG num_parts = 0; while (n > 0){ width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); - if (width < switch_ratio && width > 1) { + if (width < switch_ratio) { width = switch_ratio; } width = round_up(n, width, GEMM_PREFERED_SIZE); From c1bb90a823eebb2aed4bde424941536c900e2fe1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Feb 2025 14:23:07 +0100 Subject: [PATCH 059/205] remove the express NeoverseN2 target from the Cobalt100 job --- .github/workflows/dynamic_arch.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index b388cb1b2..2d5c7b612 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -372,7 +372,7 @@ jobs: - name: Build OpenBLAS run: | - make -j${nproc} TARGET=NEOVERSEN2 - make -j${nproc} TARGET=NEOVERSEN2 lapack-test + make -j${nproc} + make -j${nproc} lapack-test From b9ae246f205909c14561d4e8a52b21d72c7f5e8a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Feb 2025 23:18:04 +0100 Subject: [PATCH 060/205] define USE_TRMM for RISCV64 targets as well --- kernel/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index b43cda2c1..55daa6d1e 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -201,7 +201,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # 
Makefile.L3 set(USE_TRMM false) string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) - if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) + if (ARM OR ARM64 OR RISCV64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) set(USE_TRMM true) endif () if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) From ebcab9097674f35c1fd7d87fee79875e4703ccab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Feb 2025 23:12:58 +0100 Subject: [PATCH 061/205] Handle flang-new runtime library linking on Linux like classic-flang --- exports/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exports/Makefile b/exports/Makefile index 668a4866e..04fc64cfe 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -197,7 +197,7 @@ ifeq ($(F_COMPILER), INTEL) -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. -else ifeq ($(F_COMPILER), FLANG) +else ifeq ($(F_COMPILER), $(filter $(F_COMPILER),FLANG FLANGNEW)) $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) From 6d1444be3ab6617944d1e54c29e46bc90ea5e01f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Feb 2025 14:26:43 +0100 Subject: [PATCH 062/205] Add ARM64 options for NVIDIA HPC --- cmake/cc.cmake | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 5e9c5a8c4..bffc7a7d6 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -84,7 +84,7 @@ endif () if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC") if (POWER) set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8") - else () + elseif (X86_64) set(CCOMMON_OPT "${CCOMMON_OPT} -tp px") endif () endif () @@ -182,7 +182,9 @@ endif () if (${CORE} STREQUAL A64FX) if (NOT DYNAMIC_ARCH) - if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) + if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=a64fx") + elseif (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") else () set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") @@ -194,6 +196,8 @@ if (${CORE} STREQUAL NEOVERSEN2) if (NOT DYNAMIC_ARCH) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v2") else () if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") @@ -208,6 +212,8 @@ if (${CORE} STREQUAL NEOVERSEV1) if (NOT DYNAMIC_ARCH) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") + elseif 
(${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-v1") else () if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") @@ -220,7 +226,9 @@ endif () if (${CORE} STREQUAL NEOVERSEN1) if (NOT DYNAMIC_ARCH) - if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) + if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-n1") + elseif (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1") else () set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") @@ -232,6 +240,8 @@ if (${CORE} STREQUAL ARMV8SVE) if (NOT DYNAMIC_ARCH) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve") + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") else () set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") endif () @@ -240,6 +250,9 @@ endif () if (${CORE} STREQUAL ARMV9SME) if (NOT DYNAMIC_ARCH) + if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") + else () set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme") endif () endif () From f1fa370579aa2505975f85327d84b3b169a1228a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Feb 2025 15:22:26 +0100 Subject: [PATCH 063/205] fix missing endif --- cmake/cc.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index bffc7a7d6..f292f1c57 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -254,6 +254,7 @@ if (${CORE} STREQUAL ARMV9SME) set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") else () set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme") + endif () endif () endif () From b723c1b7b79663583e303e43132ee2e79ed1592c Mon Sep 17 00:00:00 2001 From: Marek Michalowski Date: Thu, 20 Feb 2025 10:18:47 +0000 Subject: [PATCH 064/205] Add thread throttling profile for SGEMM on `NEOVERSEV2` --- CONTRIBUTORS.md | 3 ++- interface/gemm.c | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index f4a93aa1b..80dd9211f 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -237,8 +237,9 @@ In chronological order: * [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1 * [2025-01-21] Optimize gemv_t_sve_v1x3 kernel -* Marek Michalowski +* Marek Michalowski * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` + * [2025-02-18] Add thread throttling profile for SGEMM on `NEOVERSEV2` * Ye Tao * [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 diff --git a/interface/gemm.c b/interface/gemm.c index 2cd7d7b5c..67ab42b48 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -198,14 +198,37 @@ static inline int get_gemm_optimal_nthreads_neoversev1(double MNK, int ncpu) { } #endif +#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV2) +static inline int get_gemm_optimal_nthreads_neoversev2(double MNK, int ncpu) { + return + MNK < 125000L ? 1 + : MNK < 1092727L ? MIN(ncpu, 6) + : MNK < 2628072L ? MIN(ncpu, 8) + : MNK < 8000000L ? MIN(ncpu, 12) + : MNK < 20346417L ? MIN(ncpu, 16) + : MNK < 57066625L ? MIN(ncpu, 24) + : MNK < 91125000L ? MIN(ncpu, 28) + : MNK < 238328000L ? MIN(ncpu, 40) + : MNK < 454756609L ? MIN(ncpu, 48) + : MNK < 857375000L ? 
MIN(ncpu, 56) + : MNK < 1073741824L ? MIN(ncpu, 64) + : ncpu; +} +#endif + static inline int get_gemm_optimal_nthreads(double MNK) { int ncpu = num_cpu_avail(3); #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); +#elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) + return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu); #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) if (strcmp(gotoblas_corename(), "neoversev1") == 0) { return get_gemm_optimal_nthreads_neoversev1(MNK, ncpu); } + if (strcmp(gotoblas_corename(), "neoversev2") == 0) { + return get_gemm_optimal_nthreads_neoversev2(MNK, ncpu); + } #endif if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) { return 1; From 650a062e19e452cf1eb77617b14af4d8a838fc27 Mon Sep 17 00:00:00 2001 From: Marek Michalowski Date: Thu, 20 Feb 2025 10:19:40 +0000 Subject: [PATCH 065/205] Add thread throttling profile for SGEMV on `NEOVERSEV2` --- CONTRIBUTORS.md | 1 + interface/gemv.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 80dd9211f..99166f520 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -240,6 +240,7 @@ In chronological order: * Marek Michalowski * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` * [2025-02-18] Add thread throttling profile for SGEMM on `NEOVERSEV2` + * [2025-02-19] Add thread throttling profile for SGEMV on `NEOVERSEV2` * Ye Tao * [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 diff --git a/interface/gemv.c b/interface/gemv.c index f91f364ee..4bcdf07c4 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -77,14 +77,30 @@ static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { } #endif +#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV2) +static inline int get_gemv_optimal_nthreads_neoversev2(BLASLONG MN, int ncpu) { + return + MN < 24964L ? 1 + : MN < 65536L ? MIN(ncpu, 8) + : MN < 262144L ? MIN(ncpu, 32) + : MN < 1638400L ? 
MIN(ncpu, 64) + : ncpu; +} +#endif + static inline int get_gemv_optimal_nthreads(BLASLONG MN) { int ncpu = num_cpu_avail(3); #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); +#elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) + return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) if (strcmp(gotoblas_corename(), "neoversev1") == 0) { return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); } + if (strcmp(gotoblas_corename(), "neoversev2") == 0) { + return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); + } #endif if ( MN < 115200L * GEMM_MULTITHREAD_THRESHOLD ) From 75b958a0184a614c75ea41446d23d1b066acb2fc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 20 Feb 2025 23:54:12 +0100 Subject: [PATCH 066/205] Transform the B array back if necessary before returning --- interface/gemmt.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/interface/gemmt.c b/interface/gemmt.c index bcccf5a74..aa65f81ed 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -688,5 +688,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, IDEBUG_END; +/* transform B back if necessary */ +#if defined(COMPLEX) + if (transb > 1){ +#ifndef CBLAS + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#else + if (order == CblasColMajor) + IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); + if (order == CblasRowMajor) + IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb); +#endif + } +#endif + return; } From f0bea79a6e151299cecdfb191969dbfe88ece814 Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Fri, 21 Feb 2025 10:03:50 +0000 Subject: [PATCH 067/205] dispatch NEOVERSEV2 to NEOVERSEN2 under dynamic setting --- driver/others/dynamic_arm64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 37991184a..3174e2284 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -150,7 +150,7 @@ extern gotoblas_t gotoblas_A64FX; #endif extern gotoblas_t gotoblas_THUNDERX3T110; #endif -#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 +#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEN2 extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 From 77fba0f400beedc73ac817b2d5b3f86abefa9d7b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Feb 2025 20:09:21 +0100 Subject: [PATCH 068/205] Fix "dummy2" flag handling --- kernel/power/scal.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/power/scal.S b/kernel/power/scal.S index eceb9fe8e..8fd175d18 100644 --- a/kernel/power/scal.S +++ b/kernel/power/scal.S @@ -51,7 +51,7 @@ #else #define X r7 #define INCX r8 -#define FLAG r12 +#define FLAG r11 #endif #endif @@ -63,7 +63,7 @@ #else #define X r7 #define INCX r8 -#define FLAG r12 +#define FLAG r11 #endif #endif @@ -91,7 +91,7 @@ fcmpu cr0, FZERO, ALPHA bne- cr0, LL(A1I1) - LDLONG FLAG, 48+64+8(SP) + LDLONG FLAG, 104(SP) cmpwi cr0, FLAG, 1 beq- cr0, LL(A1I1) From 030ae1fd97f04c0ff4536e4e35567147409fb985 Mon Sep 17 00:00:00 2001 From: Harishmcw Date: Tue, 25 Feb 2025 15:40:39 +0530 Subject: [PATCH 069/205] Redefined threading logic for WoA --- interface/gemv.c | 5 +++++ interface/lapack/gesv.c | 10 ++++++---- interface/zgemv.c | 15 ++++++++++----- 3 files changed, 21 insertions(+), 9 
deletions(-) diff --git a/interface/gemv.c b/interface/gemv.c index f91f364ee..0f8fe6678 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -79,6 +79,11 @@ static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { static inline int get_gemv_optimal_nthreads(BLASLONG MN) { int ncpu = num_cpu_avail(3); +#if defined(_WIN64) && defined(_M_ARM64) + if (MN > 100000000L) + return num_cpu_avail(4); + return 1; +#endif #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) diff --git a/interface/lapack/gesv.c b/interface/lapack/gesv.c index 51a38de60..21fcc2097 100644 --- a/interface/lapack/gesv.c +++ b/interface/lapack/gesv.c @@ -117,13 +117,15 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, #if defined(_WIN64) && defined(_M_ARM64) #ifdef COMPLEX - if (args.m * args.n > 600) + if (args.m * args.n <= 300) #else - if (args.m * args.n > 1000) + if (args.m * args.n <= 500) #endif - args.nthreads = num_cpu_avail(4); - else args.nthreads = 1; + else if (args.m * args.n <= 1000) + args.nthreads = 4; + else + args.nthreads = num_cpu_avail(4); #else #ifndef DOUBLE if (args.m * args.n < 40000) diff --git a/interface/zgemv.c b/interface/zgemv.c index 3e98dba7f..3438575b9 100644 --- a/interface/zgemv.c +++ b/interface/zgemv.c @@ -252,25 +252,30 @@ void CNAME(enum CBLAS_ORDER order, #ifdef SMP - if ( 1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD ) +#if defined(_WIN64) && defined(_M_ARM64) + if (m*n > 25000000L) + nthreads = num_cpu_avail(4); + else + nthreads = 1; +#else + if (1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD) nthreads = 1; else nthreads = num_cpu_avail(2); +#endif if (nthreads == 1) { -#endif +#endif (gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); #ifdef SMP - } else { - (gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); - } #endif + STACK_FREE(buffer); FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); From 09ba0994615c2e78570ca400bf13da0a8a45d873 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 25 Feb 2025 12:10:48 +0100 Subject: [PATCH 070/205] make throttling code conditional on SMP --- interface/gemm.c | 2 ++ interface/gemv.c | 9 ++------- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index 67ab42b48..d36925629 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -177,6 +177,7 @@ static int init_amxtile_permission() { } #endif +#ifdef SMP #ifdef DYNAMIC_ARCH extern char* gotoblas_corename(void); #endif @@ -242,6 +243,7 @@ static inline int get_gemm_optimal_nthreads(double MNK) { } } } +#endif #ifndef CBLAS diff --git a/interface/gemv.c b/interface/gemv.c index 4bcdf07c4..533ea3a56 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -63,6 +63,7 @@ static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT }; #endif +#ifdef SMP #ifdef DYNAMIC_ARCH extern char* gotoblas_corename(void); #endif @@ -108,6 +109,7 @@ static inline int get_gemv_optimal_nthreads(BLASLONG MN) { else return num_cpu_avail(2); } +#endif #ifndef CBLAS @@ -248,13 +250,6 @@ void CNAME(enum CBLAS_ORDER order, if (alpha == ZERO) return; -#if 0 -/* this optimization causes stack corruption on x86_64 under OSX, Windows and FreeBSD */ - if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { - GEMV_N(m, n, 0, 
alpha, a, lda, x, incx, y, incy, NULL); - return; - } -#endif IDEBUG_START; FUNCTION_PROFILE_START(); From edaf51dd99bb979f15fa4f2774ba068cfec0c09e Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Wed, 26 Feb 2025 12:47:11 +0000 Subject: [PATCH 071/205] Add sbgemv_t_bfdot kernel for ARM64 This improves performance for sbgemv_t by up to 100x on NEOVERSEV1. The geometric mean speedup is ~61x for M=N=[2,512]. --- CONTRIBUTORS.md | 1 + kernel/arm64/KERNEL.NEOVERSEN2 | 1 + kernel/arm64/KERNEL.NEOVERSEV1 | 1 + kernel/arm64/KERNEL.NEOVERSEV2 | 4 + kernel/arm64/sbgemv_t_bfdot.c | 207 +++++++++++++++++++++++++++++++++ 5 files changed, 214 insertions(+) create mode 100644 kernel/arm64/sbgemv_t_bfdot.c diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 99166f520..9ce5e37de 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -236,6 +236,7 @@ In chronological order: * Annop Wongwathanarat * [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1 * [2025-01-21] Optimize gemv_t_sve_v1x3 kernel + * [2025-02-26] Add sbgemv_t_bfdot kernel * Marek Michalowski * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index 2f7400113..e4e1cfde3 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -198,3 +198,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +SBGEMVTKERNEL = sbgemv_t_bfdot.c \ No newline at end of file diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index 8845e6860..374acb35b 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -15,4 +15,5 @@ SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) +SBGEMVTKERNEL = sbgemv_t_bfdot.c endif \ No newline at end of file diff --git a/kernel/arm64/KERNEL.NEOVERSEV2 b/kernel/arm64/KERNEL.NEOVERSEV2 index bc5999097..4d866f858 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV2 +++ b/kernel/arm64/KERNEL.NEOVERSEV2 @@ -1 +1,5 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE + +ifeq ($(BUILD_BFLOAT16), 1) +SBGEMVTKERNEL = sbgemv_t_bfdot.c +endif \ No newline at end of file diff --git a/kernel/arm64/sbgemv_t_bfdot.c b/kernel/arm64/sbgemv_t_bfdot.c new file mode 100644 index 000000000..0751690fc --- /dev/null +++ b/kernel/arm64/sbgemv_t_bfdot.c @@ -0,0 +1,207 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +static inline float bf16_to_fp32(bfloat16 bf16) { + uint32_t fp32 = (uint32_t)bf16 << 16; + return *((float*)&fp32); +} + +int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy) +{ + if (m < 1 || n < 1) return(0); + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + bfloat16_t *a_ptr; + bfloat16_t *x_ptr; + float *y_ptr; + float temp; + + iy = 0; + a_ptr = (bfloat16_t*)(a); + x_ptr = (bfloat16_t*)(x); + + if (incx == 1) { + BLASLONG width = n / 4; + + bfloat16_t *a0_ptr = a_ptr + lda * width * 0; + bfloat16_t *a1_ptr = a_ptr + lda * width * 1; + bfloat16_t *a2_ptr = a_ptr + lda * width * 2; + bfloat16_t *a3_ptr = a_ptr + lda * width * 3; + + float *y0_ptr = y + incy * width * 0; + float *y1_ptr = y + incy * width * 1; + float *y2_ptr = y + incy * width * 2; + float *y3_ptr = y + incy * width * 3; + + for (j = 0; j < width; j++) { + float32x4_t temp0_vec = vdupq_n_f32(0.0f); + float32x4_t temp1_vec = vdupq_n_f32(0.0f); + float32x4_t temp2_vec = vdupq_n_f32(0.0f); + float32x4_t temp3_vec = vdupq_n_f32(0.0f); + + i = 0; + while (i + 7 < m) { + bfloat16x8_t x_vec = vld1q_bf16(x_ptr + i); + + bfloat16x8_t a0_vec = vld1q_bf16(a0_ptr + i); + bfloat16x8_t a1_vec = vld1q_bf16(a1_ptr + i); + bfloat16x8_t a2_vec = vld1q_bf16(a2_ptr + i); + bfloat16x8_t a3_vec = vld1q_bf16(a3_ptr + i); + + temp0_vec = vbfdotq_f32(temp0_vec, a0_vec, x_vec); + temp1_vec = vbfdotq_f32(temp1_vec, a1_vec, x_vec); + temp2_vec = vbfdotq_f32(temp2_vec, a2_vec, x_vec); + temp3_vec = vbfdotq_f32(temp3_vec, a3_vec, x_vec); + + i += 8; + } + if (i + 3 < m) { + float32x2_t t0 = vdup_n_f32(0.0f); + float32x2_t t1 = vdup_n_f32(0.0f); + float32x2_t t2 = vdup_n_f32(0.0f); + float32x2_t t3 = vdup_n_f32(0.0f); + + bfloat16x4_t x_vec = vld1_bf16(x_ptr + i); + + bfloat16x4_t a0_vec = vld1_bf16(a0_ptr + i); + bfloat16x4_t a1_vec = vld1_bf16(a1_ptr + i); + bfloat16x4_t a2_vec = vld1_bf16(a2_ptr + i); + bfloat16x4_t a3_vec = vld1_bf16(a3_ptr + i); + + t0 = vbfdot_f32(t0, a0_vec, x_vec); + t1 = vbfdot_f32(t1, a1_vec, x_vec); + t2 = vbfdot_f32(t2, a2_vec, x_vec); + t3 = vbfdot_f32(t3, a3_vec, x_vec); + + float32x2_t temp0_vec_low = vget_low_f32(temp0_vec); + float32x2_t temp1_vec_low = vget_low_f32(temp1_vec); + float32x2_t temp2_vec_low = vget_low_f32(temp2_vec); + float32x2_t temp3_vec_low = vget_low_f32(temp3_vec); + + temp0_vec = vcombine_f32(vadd_f32(t0, temp0_vec_low), vget_high_f32(temp0_vec)); + temp1_vec = vcombine_f32(vadd_f32(t1, temp1_vec_low), vget_high_f32(temp1_vec)); + temp2_vec = vcombine_f32(vadd_f32(t2, temp2_vec_low), vget_high_f32(temp2_vec)); + temp3_vec = 
vcombine_f32(vadd_f32(t3, temp3_vec_low), vget_high_f32(temp3_vec)); + + i += 4; + } + if (beta == 0.0f) { + y0_ptr[iy] = alpha * vaddvq_f32(temp0_vec); + y1_ptr[iy] = alpha * vaddvq_f32(temp1_vec); + y2_ptr[iy] = alpha * vaddvq_f32(temp2_vec); + y3_ptr[iy] = alpha * vaddvq_f32(temp3_vec); + } + else { + y0_ptr[iy] = alpha * vaddvq_f32(temp0_vec) + beta * y0_ptr[iy]; + y1_ptr[iy] = alpha * vaddvq_f32(temp1_vec) + beta * y1_ptr[iy]; + y2_ptr[iy] = alpha * vaddvq_f32(temp2_vec) + beta * y2_ptr[iy]; + y3_ptr[iy] = alpha * vaddvq_f32(temp3_vec) + beta * y3_ptr[iy]; + } + + for (; i < m; ++i) { + y0_ptr[iy] += alpha * a0_ptr[i] * x_ptr[i]; + y1_ptr[iy] += alpha * a1_ptr[i] * x_ptr[i]; + y2_ptr[iy] += alpha * a2_ptr[i] * x_ptr[i]; + y3_ptr[iy] += alpha * a3_ptr[i] * x_ptr[i]; + } + + iy += incy; + + a0_ptr += lda; + a1_ptr += lda; + a2_ptr += lda; + a3_ptr += lda; + } + + a_ptr = a3_ptr; + y_ptr = y3_ptr; + for (j = width * 4; j < n; j++) { + float32x4_t temp0_vec = vdupq_n_f32(0.0f); + i = 0; + while (i + 7 < m) { + bfloat16x8_t x_vec = vld1q_bf16(x_ptr + i); + bfloat16x8_t a0_vec = vld1q_bf16(a_ptr + i); + temp0_vec = vbfdotq_f32(temp0_vec, a0_vec, x_vec); + + i += 8; + } + if (i + 3 < m) { + float32x2_t t0 = vdup_n_f32(0.0f); + bfloat16x4_t x_vec = vld1_bf16(x_ptr + i); + bfloat16x4_t a0_vec = vld1_bf16(a_ptr + i); + + t0 = vbfdot_f32(t0, a0_vec, x_vec); + float32x2_t temp0_vec_low = vget_low_f32(temp0_vec); + temp0_vec = vcombine_f32(vadd_f32(t0, temp0_vec_low), vget_high_f32(temp0_vec)); + + i += 4; + } + if (beta == 0.0f) { + y_ptr[iy] = alpha * vaddvq_f32(temp0_vec); + } + else { + y_ptr[iy] = alpha * vaddvq_f32(temp0_vec) + beta * y_ptr[iy]; + } + + for (; i < m; ++i) { + y_ptr[iy] += alpha * a_ptr[i] * x_ptr[i]; + } + + iy += incy; + + a_ptr += lda; + } + return(0); + } + + for (j = 0; j < n; j++) { + temp = 0.0; + ix = 0; + for (i = 0; i < m; i++) { + temp += bf16_to_fp32(a[i]) * bf16_to_fp32(x[ix]); + ix += incx; + } + if (beta == 0.0f) { + y[iy] = alpha * temp; + } + else { + y[iy] = alpha * temp + beta * y[iy]; + } + iy += incy; + a += lda; + } + return (0); +} From 35bdbca1535c3297a64d304977c6aa28e2e57e49 Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Thu, 27 Feb 2025 18:15:17 +0000 Subject: [PATCH 072/205] Add sbgemv_n_neon kernel for arm64. 
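For reference, the operation this kernel implements is y := beta*y + alpha*A*x for a
column-major m-by-n bfloat16 matrix A (leading dimension lda), a bfloat16 vector x and a
float32 vector y, with accumulation done in fp32; the NEON code below handles eight
columns of A per outer iteration using the bfmlalb/bfmlalt instructions. A minimal scalar
sketch of that contract follows, assuming unit strides; the bf16_as_fp32 helper and the
function name are illustrative only and are not part of this patch:

    #include <stdint.h>
    #include <string.h>

    /* Widen one bfloat16 (stored as uint16_t) into the high half of an
     * IEEE-754 single. Illustrative helper, not the kernel's own code. */
    static float bf16_as_fp32(uint16_t h) {
        uint32_t u = (uint32_t)h << 16;
        float f;
        memcpy(&f, &u, sizeof f);
        return f;
    }

    /* Reference semantics: y[0:m] = beta*y[0:m] + alpha * A[0:m,0:n] * xv[0:n],
     * column-major A with leading dimension lda, unit strides assumed. */
    static void sbgemv_n_ref(int m, int n, float alpha, const uint16_t *A, int lda,
                             const uint16_t *xv, float beta, float *y) {
        if (beta == 0.0f)
            memset(y, 0, (size_t)m * sizeof(float)); /* matches the kernel's beta == 0 path */
        else
            for (int i = 0; i < m; i++) y[i] *= beta;
        for (int j = 0; j < n; j++) {
            float xj = alpha * bf16_as_fp32(xv[j]);
            for (int i = 0; i < m; i++)
                y[i] += xj * bf16_as_fp32(A[(size_t)j * lda + i]);
        }
    }
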
--- CONTRIBUTORS.md | 1 + kernel/arm64/KERNEL.NEOVERSEV1 | 2 + kernel/arm64/sbgemv_n_neon.c | 542 +++++++++++++++++++++++++++++++++ 3 files changed, 545 insertions(+) create mode 100644 kernel/arm64/sbgemv_n_neon.c diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index f4a93aa1b..9edf3d6ea 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -242,3 +242,4 @@ In chronological order: * Ye Tao * [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 + * [2025-02-27] Add sbgemv_n_neon kernel \ No newline at end of file diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index 8845e6860..d14993544 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -15,4 +15,6 @@ SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) + +SBGEMVNKERNEL = sbgemv_n_neon.c endif \ No newline at end of file diff --git a/kernel/arm64/sbgemv_n_neon.c b/kernel/arm64/sbgemv_n_neon.c new file mode 100644 index 000000000..cdb54298c --- /dev/null +++ b/kernel/arm64/sbgemv_n_neon.c @@ -0,0 +1,542 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if (defined(__GNUC__) && __GNUC__ >= 13) +#define BF16_TO_FP32(bf16) ((float)(bf16)) +#else +static inline float bf16_to_fp32(bfloat16_t bf16) { + uint32_t fp32 = (uint32_t)(*((u_int16_t*)(&bf16))) << 16; + return *((float*)&fp32); +} +#define BF16_TO_FP32(bf16) bf16_to_fp32(bf16) +#endif + +static void beta_op(float *x, BLASLONG n, FLOAT beta) { + if (beta == 0) { + memset(x, 0, n * sizeof(float)); + return; + } + + float32x4_t y0, y1, y2, y3; + + for (BLASLONG i = 0; i < n / 16; i++) { + y0 = vld1q_f32(x); + y1 = vld1q_f32(x + 4); + y2 = vld1q_f32(x + 8); + y3 = vld1q_f32(x + 12); + + y0 = vmulq_n_f32(y0, beta); + y1 = vmulq_n_f32(y1, beta); + y2 = vmulq_n_f32(y2, beta); + y3 = vmulq_n_f32(y3, beta); + + vst1q_f32(x, y0); + vst1q_f32(x + 4, y1); + vst1q_f32(x + 8, y2); + vst1q_f32(x + 12, y3); + + x += 16; + } + + if (n & 15) { + BLASLONG rest_n = n & 15; + for (BLASLONG i = 0; i < (rest_n) / 4; i++) { + y0 = vld1q_f32(x); + y0 = vmulq_n_f32(y0, beta); + vst1q_f32(x, y0); + x += 4; + } + + if (rest_n & 3) { + x[0] *= beta; + if ((rest_n & 3) > 1) + x[1] *= beta; + if ((rest_n & 3) > 2) + x[2] *= beta; + } + } + return; +} + +int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, + bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy) { + BLASLONG i, j; + bfloat16_t *a_ptr, *x_ptr; + FLOAT *y_ptr; + + bfloat16x8_t a0, a1, a2, a3, a4, a5, a6, a7; + bfloat16x8_t t0, t1, t2, t3, t4, t5, t6, t7; + bfloat16x8_t x_vec; + float32x4_t y1_vec, y2_vec; + float32x4_t fp32_low, fp32_high; + + float x0, x1, x2, x3, x4, x5, x6, x7; + bfloat16_t *a_ptr0, *a_ptr1, *a_ptr2, *a_ptr3, *a_ptr4, *a_ptr5, *a_ptr6, + *a_ptr7; + + a_ptr = (bfloat16_t *)a; + x_ptr = (bfloat16_t *)x; + + BLASLONG rest_m = m & 3; + + bfloat16x4_t bf16_zero = vreinterpret_bf16_u16(vdup_n_u16(0)); + bfloat16x8_t bf16_zero_q = vreinterpretq_bf16_u16(vdupq_n_u16(0)); + + if (incx == 1 && incy == 1) { + if (beta != 1) { + beta_op(y, n, beta); + } + + for (i = 0; i < n / 8; i++) { + a_ptr0 = a_ptr; + a_ptr1 = a_ptr0 + lda; + a_ptr2 = a_ptr1 + lda; + a_ptr3 = a_ptr2 + lda; + a_ptr4 = a_ptr3 + lda; + a_ptr5 = a_ptr4 + lda; + a_ptr6 = a_ptr5 + lda; + a_ptr7 = a_ptr6 + lda; + + a_ptr += 8 * lda; + + y_ptr = y; + + x_vec = vld1q_bf16(x_ptr); + + if (alpha != 1) { + fp32_low = vreinterpretq_f32_u16( + vzip1q_u16(vreinterpretq_u16_bf16(bf16_zero_q), + vreinterpretq_u16_bf16(x_vec))); + fp32_high = vreinterpretq_f32_u16( + vzip2q_u16(vreinterpretq_u16_bf16(bf16_zero_q), + vreinterpretq_u16_bf16(x_vec))); + + fp32_low = vmulq_n_f32(fp32_low, alpha); + fp32_high = vmulq_n_f32(fp32_high, alpha); + + x_vec = + vcombine_bf16(vcvt_bf16_f32(fp32_low), vcvt_bf16_f32(fp32_high)); + } + + for (j = 0; j < m / 8; j++) { + a0 = vld1q_bf16(a_ptr0); + a1 = vld1q_bf16(a_ptr1); + a2 = vld1q_bf16(a_ptr2); + a3 = vld1q_bf16(a_ptr3); + a4 = vld1q_bf16(a_ptr4); + a5 = vld1q_bf16(a_ptr5); + a6 = vld1q_bf16(a_ptr6); + a7 = vld1q_bf16(a_ptr7); + + y1_vec = vld1q_f32(y_ptr); + y2_vec = vld1q_f32(y_ptr + 4); + + t0 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t1 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + t2 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5))); + t3 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7))); + + t4 = 
vreinterpretq_bf16_u16( + vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t5 = vreinterpretq_bf16_u16( + vzip2q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + t6 = vreinterpretq_bf16_u16( + vzip2q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5))); + t7 = vreinterpretq_bf16_u16( + vzip2q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7))); + + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t0, x_vec, 0); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t0, x_vec, 1); + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t1, x_vec, 2); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t1, x_vec, 3); + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t2, x_vec, 4); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t2, x_vec, 5); + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t3, x_vec, 6); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t3, x_vec, 7); + + y2_vec = vbfmlalbq_laneq_f32(y2_vec, t4, x_vec, 0); + y2_vec = vbfmlaltq_laneq_f32(y2_vec, t4, x_vec, 1); + y2_vec = vbfmlalbq_laneq_f32(y2_vec, t5, x_vec, 2); + y2_vec = vbfmlaltq_laneq_f32(y2_vec, t5, x_vec, 3); + y2_vec = vbfmlalbq_laneq_f32(y2_vec, t6, x_vec, 4); + y2_vec = vbfmlaltq_laneq_f32(y2_vec, t6, x_vec, 5); + y2_vec = vbfmlalbq_laneq_f32(y2_vec, t7, x_vec, 6); + y2_vec = vbfmlaltq_laneq_f32(y2_vec, t7, x_vec, 7); + + vst1q_f32(y_ptr, y1_vec); + vst1q_f32(y_ptr + 4, y2_vec); + + a_ptr0 += 8; + a_ptr1 += 8; + a_ptr2 += 8; + a_ptr3 += 8; + a_ptr4 += 8; + a_ptr5 += 8; + a_ptr6 += 8; + a_ptr7 += 8; + + y_ptr += 8; + } + + if (m & 4) { + bfloat16x4_t a0x4 = vld1_bf16(a_ptr0); + bfloat16x4_t a1x4 = vld1_bf16(a_ptr1); + bfloat16x4_t a2x4 = vld1_bf16(a_ptr2); + bfloat16x4_t a3x4 = vld1_bf16(a_ptr3); + bfloat16x4_t a4x4 = vld1_bf16(a_ptr4); + bfloat16x4_t a5x4 = vld1_bf16(a_ptr5); + bfloat16x4_t a6x4 = vld1_bf16(a_ptr6); + bfloat16x4_t a7x4 = vld1_bf16(a_ptr7); + + y1_vec = vld1q_f32(y_ptr); + + a0 = vcombine_bf16(a0x4, bf16_zero); + a1 = vcombine_bf16(a1x4, bf16_zero); + a2 = vcombine_bf16(a2x4, bf16_zero); + a3 = vcombine_bf16(a3x4, bf16_zero); + a4 = vcombine_bf16(a4x4, bf16_zero); + a5 = vcombine_bf16(a5x4, bf16_zero); + a6 = vcombine_bf16(a6x4, bf16_zero); + a7 = vcombine_bf16(a7x4, bf16_zero); + + t0 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t1 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + t2 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a4), vreinterpretq_u16_bf16(a5))); + t3 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a6), vreinterpretq_u16_bf16(a7))); + + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t0, x_vec, 0); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t0, x_vec, 1); + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t1, x_vec, 2); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t1, x_vec, 3); + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t2, x_vec, 4); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t2, x_vec, 5); + y1_vec = vbfmlalbq_laneq_f32(y1_vec, t3, x_vec, 6); + y1_vec = vbfmlaltq_laneq_f32(y1_vec, t3, x_vec, 7); + + vst1q_f32(y_ptr, y1_vec); + + a_ptr0 += 4; + a_ptr1 += 4; + a_ptr2 += 4; + a_ptr3 += 4; + a_ptr4 += 4; + a_ptr5 += 4; + a_ptr6 += 4; + a_ptr7 += 4; + + y_ptr += 4; + } + + if (rest_m) { + x0 = alpha * BF16_TO_FP32(x_ptr[0]); + x1 = alpha * BF16_TO_FP32(x_ptr[1]); + x2 = alpha * BF16_TO_FP32(x_ptr[2]); + x3 = alpha * BF16_TO_FP32(x_ptr[3]); + x4 = alpha * BF16_TO_FP32(x_ptr[4]); + x5 = alpha * BF16_TO_FP32(x_ptr[5]); + x6 = alpha * BF16_TO_FP32(x_ptr[6]); + x7 = alpha * BF16_TO_FP32(x_ptr[7]); + + for (BLASLONG j = 0; j < rest_m; j++) { + 
y_ptr[j] += x0 * BF16_TO_FP32(a_ptr0[j]); + y_ptr[j] += x1 * BF16_TO_FP32(a_ptr1[j]); + y_ptr[j] += x2 * BF16_TO_FP32(a_ptr2[j]); + y_ptr[j] += x3 * BF16_TO_FP32(a_ptr3[j]); + y_ptr[j] += x4 * BF16_TO_FP32(a_ptr4[j]); + y_ptr[j] += x5 * BF16_TO_FP32(a_ptr5[j]); + y_ptr[j] += x6 * BF16_TO_FP32(a_ptr6[j]); + y_ptr[j] += x7 * BF16_TO_FP32(a_ptr7[j]); + } + } + + x_ptr += 8; + } + + if (n & 4) { + a_ptr0 = a_ptr; + a_ptr1 = a_ptr0 + lda; + a_ptr2 = a_ptr1 + lda; + a_ptr3 = a_ptr2 + lda; + + a_ptr += 4 * lda; + + bfloat16x4_t x_vecx4 = vld1_bf16(x_ptr); + if (alpha != 1) { + x_vec = vcombine_bf16(x_vecx4, bf16_zero); + fp32_low = vreinterpretq_f32_u16( + vzip1q_u16(vreinterpretq_u16_bf16(bf16_zero_q), + vreinterpretq_u16_bf16(x_vec))); + fp32_low = vmulq_n_f32(fp32_low, alpha); + x_vecx4 = vcvt_bf16_f32(fp32_low); + } + + y_ptr = y; + for (j = 0; j < m / 8; j++) { + a0 = vld1q_bf16(a_ptr0); + a1 = vld1q_bf16(a_ptr1); + a2 = vld1q_bf16(a_ptr2); + a3 = vld1q_bf16(a_ptr3); + + y1_vec = vld1q_f32(y_ptr); + y2_vec = vld1q_f32(y_ptr + 4); + + t0 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t1 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + t4 = vreinterpretq_bf16_u16( + vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t5 = vreinterpretq_bf16_u16( + vzip2q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + + y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); + y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); + y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2); + y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3); + + y2_vec = vbfmlalbq_lane_f32(y2_vec, t4, x_vecx4, 0); + y2_vec = vbfmlaltq_lane_f32(y2_vec, t4, x_vecx4, 1); + y2_vec = vbfmlalbq_lane_f32(y2_vec, t5, x_vecx4, 2); + y2_vec = vbfmlaltq_lane_f32(y2_vec, t5, x_vecx4, 3); + + vst1q_f32(y_ptr, y1_vec); + vst1q_f32(y_ptr + 4, y2_vec); + + a_ptr0 += 8; + a_ptr1 += 8; + a_ptr2 += 8; + a_ptr3 += 8; + + y_ptr += 8; + } + + if (m & 4) { + bfloat16x4_t a0x4 = vld1_bf16(a_ptr0); + bfloat16x4_t a1x4 = vld1_bf16(a_ptr1); + bfloat16x4_t a2x4 = vld1_bf16(a_ptr2); + bfloat16x4_t a3x4 = vld1_bf16(a_ptr3); + + y1_vec = vld1q_f32(y_ptr); + + a0 = vcombine_bf16(a0x4, bf16_zero); + a1 = vcombine_bf16(a1x4, bf16_zero); + a2 = vcombine_bf16(a2x4, bf16_zero); + a3 = vcombine_bf16(a3x4, bf16_zero); + + t0 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t1 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + + y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); + y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); + y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2); + y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3); + + vst1q_f32(y_ptr, y1_vec); + + a_ptr0 += 4; + a_ptr1 += 4; + a_ptr2 += 4; + a_ptr3 += 4; + + y_ptr += 4; + } + + if (rest_m) { + x0 = alpha * BF16_TO_FP32(x_ptr[0]); + x1 = alpha * BF16_TO_FP32(x_ptr[1]); + x2 = alpha * BF16_TO_FP32(x_ptr[2]); + x3 = alpha * BF16_TO_FP32(x_ptr[3]); + + for (BLASLONG j = 0; j < rest_m; j++) { + y_ptr[j] += x0 * BF16_TO_FP32(a_ptr0[j]); + y_ptr[j] += x1 * BF16_TO_FP32(a_ptr1[j]); + y_ptr[j] += x2 * BF16_TO_FP32(a_ptr2[j]); + y_ptr[j] += x3 * BF16_TO_FP32(a_ptr3[j]); + } + } + + x_ptr += 4; + } + + if (n & 2) { + a_ptr0 = a_ptr; + a_ptr1 = a_ptr0 + lda; + + a_ptr += 2 * lda; + + bfloat16_t tmp_buffer[4]; + memset((void*)tmp_buffer, 0, sizeof(bfloat16_t)); + + 
tmp_buffer[0] = x_ptr[0]; + tmp_buffer[1] = x_ptr[1]; + + bfloat16x4_t x_vecx4 = vld1_bf16(tmp_buffer); + if (alpha != 1) { + x_vec = vcombine_bf16(x_vecx4, bf16_zero); + fp32_low = vreinterpretq_f32_u16( + vzip1q_u16(vreinterpretq_u16_bf16(bf16_zero_q), + vreinterpretq_u16_bf16(x_vec))); + fp32_low = vmulq_n_f32(fp32_low, alpha); + x_vecx4 = vcvt_bf16_f32(fp32_low); + } + + y_ptr = y; + for (j = 0; j < m / 8; j++) { + a0 = vld1q_bf16(a_ptr0); + a1 = vld1q_bf16(a_ptr1); + + y1_vec = vld1q_f32(y_ptr); + y2_vec = vld1q_f32(y_ptr + 4); + + t0 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t4 = vreinterpretq_bf16_u16( + vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + + y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); + y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); + + y2_vec = vbfmlalbq_lane_f32(y2_vec, t4, x_vecx4, 0); + y2_vec = vbfmlaltq_lane_f32(y2_vec, t4, x_vecx4, 1); + + vst1q_f32(y_ptr, y1_vec); + vst1q_f32(y_ptr + 4, y2_vec); + + a_ptr0 += 8; + a_ptr1 += 8; + + y_ptr += 8; + } + + if (m & 4) { + bfloat16x4_t a0x4 = vld1_bf16(a_ptr0); + bfloat16x4_t a1x4 = vld1_bf16(a_ptr1); + + y1_vec = vld1q_f32(y_ptr); + + a0 = vcombine_bf16(a0x4, bf16_zero); + a1 = vcombine_bf16(a1x4, bf16_zero); + + t0 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t1 = vreinterpretq_bf16_u16( + vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + + y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); + y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); + y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2); + y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3); + + vst1q_f32(y_ptr, y1_vec); + + a_ptr0 += 4; + a_ptr1 += 4; + a_ptr2 += 4; + a_ptr3 += 4; + + y_ptr += 4; + } + + if (m & 2) { + float x0, x1; + x0 = alpha * (BF16_TO_FP32(x_ptr[0])); + x1 = alpha * (BF16_TO_FP32(x_ptr[1])); + + y_ptr[0] += x0 * BF16_TO_FP32(a_ptr0[0]); + y_ptr[0] += x1 * BF16_TO_FP32(a_ptr1[0]); + y_ptr[1] += x0 * BF16_TO_FP32(a_ptr0[1]); + y_ptr[1] += x1 * BF16_TO_FP32(a_ptr1[1]); + + a_ptr0 += 2; + a_ptr1 += 2; + + y_ptr += 2; + } + + if (m & 1) { + float x0, x1; + x0 = alpha * BF16_TO_FP32(x_ptr[0]); + x1 = alpha * BF16_TO_FP32(x_ptr[1]); + + y_ptr[0] += x0 * BF16_TO_FP32(a_ptr0[0]); + y_ptr[0] += x1 * BF16_TO_FP32(a_ptr1[0]); + } + + x_ptr += 2; + } + + if (n & 1) { + x0 = BF16_TO_FP32(x_ptr[0]) * alpha; + y_ptr = y; + a_ptr0 = a_ptr; + + for (j = 0; j < m; j++) { + y_ptr[j] += x0 * BF16_TO_FP32(a_ptr0[j]); + } + } + + return (0); + } + + BLASLONG iy = 0; + for (i = 0; i < m; i++) { + y[iy] *= beta; + iy += incy; + } + + for (j = 0; j < n; j++) { + x0 = alpha * BF16_TO_FP32(*x_ptr); + iy = 0; + for (i = 0; i < m; i++) { + y[iy] += x0 * BF16_TO_FP32(a_ptr[i]); + iy += incy; + } + + a_ptr += lda; + x_ptr += incx; + } + + return (0); +} From 4346b9155970dc80b794f88583411549778fde74 Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Fri, 28 Feb 2025 13:17:46 +0000 Subject: [PATCH 073/205] add beta and alpha testcase for sbgemv --- test/compare_sgemm_sbgemm.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 05d9b33ab..ae109c1a5 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2020, The OpenBLAS Project +Copyright (c) 2020,2025 The OpenBLAS Project All rights 
reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -202,6 +202,8 @@ main (int argc, char *argv[]) return ret; } + for (beta = 0; beta < 3; beta += 1) { + for (alpha = 0; alpha < 3; alpha += 1) { for (l = 0; l < 2; l++) { // l = 1 to test inc_x & inc_y not equal to one. for (x = 1; x <= loop; x++) { @@ -230,7 +232,10 @@ main (int argc, char *argv[]) B[j << l] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; sbstobf16_(&one, &B[j << l], &one, &btmp, &one); BB[j << l].v = btmp; + + CC[j << l] = C[j << l] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; } + for (y = 0; y < 2; y++) { if (y == 0) { @@ -246,12 +251,14 @@ main (int argc, char *argv[]) SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k); SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k); + for (int i = 0; i < x; i ++) DD[i] *= beta; + for (j = 0; j < x; j++) for (i = 0; i < x; i++) if (transA == 'N') { - DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j << l]); + DD[i] += alpha * float16to32 (AA[j * x + i]) * float16to32 (BB[j << l]); } else if (transA == 'T') { - DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i << l]); + DD[j] += alpha * float16to32 (AA[j * x + i]) * float16to32 (BB[i << l]); } for (j = 0; j < x; j++) { @@ -268,8 +275,10 @@ main (int argc, char *argv[]) free(BB); free(DD); free(CC); - } - } + } // x + } // l + } // alpha + } // beta if (ret != 0) fprintf (stderr, "FATAL ERROR SBGEMV - Return code: %d\n", ret); From 35914aa9a2a862d61e391bc40a1620e11df64ad3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Mar 2025 22:54:59 +0100 Subject: [PATCH 074/205] Expose the option to build without LAPACKE to ccmake --- CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e99bd208..f8b63041a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,6 +21,8 @@ include(CMakePackageConfigHelpers) ####### option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF) +option(BUILD_WITHOUT_LAPACKE "Do not build the C interface to LAPACK)" OFF) + option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON) set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. 
\"int\") for character lengths (defaults to size_t)") @@ -81,6 +83,10 @@ if(BUILD_WITHOUT_LAPACK) set(NO_LAPACKE 1) endif() +if (BUILD_WITHOUT_LAPACKE) + set(NO_LAPACKE 1) +endif() + if(BUILD_WITHOUT_CBLAS) set(NO_CBLAS 1) endif() From 38ee7c93011946ac3cb5231600e09a0a583d93b0 Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Mon, 3 Mar 2025 11:30:45 +0000 Subject: [PATCH 075/205] Add dispatch of SBGEMVNKERNEL for NEOVERSEN2 and NEOVERSEV2 --- kernel/arm64/KERNEL.NEOVERSEN2 | 3 ++- kernel/arm64/KERNEL.NEOVERSEV2 | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index e4e1cfde3..fc7fe6930 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -198,4 +198,5 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) -SBGEMVTKERNEL = sbgemv_t_bfdot.c \ No newline at end of file +SBGEMVTKERNEL = sbgemv_t_bfdot.c +SBGEMVNKERNEL = sbgemv_n_neon.c \ No newline at end of file diff --git a/kernel/arm64/KERNEL.NEOVERSEV2 b/kernel/arm64/KERNEL.NEOVERSEV2 index 4d866f858..e08efdb9d 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV2 +++ b/kernel/arm64/KERNEL.NEOVERSEV2 @@ -2,4 +2,5 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE ifeq ($(BUILD_BFLOAT16), 1) SBGEMVTKERNEL = sbgemv_t_bfdot.c +SBGEMVNKERNEL = sbgemv_n_neon.c endif \ No newline at end of file From 6b8b35cdf2de638d67396fe624cf624087fb9d0a Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Mon, 3 Mar 2025 11:55:27 +0000 Subject: [PATCH 076/205] fix minior issues of redeclaration of float x0,x1 in sbgemv_n_neon.c --- kernel/arm64/sbgemv_n_neon.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/arm64/sbgemv_n_neon.c b/kernel/arm64/sbgemv_n_neon.c index cdb54298c..9e7ea03c8 100644 --- a/kernel/arm64/sbgemv_n_neon.c +++ b/kernel/arm64/sbgemv_n_neon.c @@ -480,7 +480,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } if (m & 2) { - float x0, x1; x0 = alpha * (BF16_TO_FP32(x_ptr[0])); x1 = alpha * (BF16_TO_FP32(x_ptr[1])); @@ -496,7 +495,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } if (m & 1) { - float x0, x1; x0 = alpha * BF16_TO_FP32(x_ptr[0]); x1 = alpha * BF16_TO_FP32(x_ptr[1]); From 39eb43d4410d0745ca2c78d1162d973abf3aa35b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 7 Mar 2025 13:48:28 +0100 Subject: [PATCH 077/205] Improve thread safety of pthreads builds that rely on C11 atomic operations for locking (#5170) * Tighten memory orders for C11 atomic operations --- driver/others/blas_server.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 7306a3ecd..4b79136ec 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -146,8 +146,8 @@ typedef struct { } thread_status_t; #ifdef HAVE_C11 -#define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED) -#define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) +#define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_ACQUIRE) +#define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELEASE) #else #define atomic_load_queue(p) (blas_queue_t*)(*(volatile blas_queue_t**)(p)) #define atomic_store_queue(p, v) (*(volatile blas_queue_t* volatile*)(p) = (v)) @@ -637,7 +637,9 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ #ifdef SMP_SERVER // Handle 
lazy re-init of the thread-pool after a POSIX fork + LOCK_COMMAND(&server_lock); if (unlikely(blas_server_avail == 0)) blas_thread_init(); + UNLOCK_COMMAND(&server_lock); #endif BLASLONG i = 0; blas_queue_t *current = queue; From 5c4e38ab17eb530e950e68e1d45ea7a2fcd25cea Mon Sep 17 00:00:00 2001 From: manjam01 Date: Thu, 27 Feb 2025 09:39:06 +0000 Subject: [PATCH 078/205] Optimize gemv_n_sve kernel --- kernel/arm64/KERNEL.ARMV8SVE | 2 +- kernel/arm64/gemv_n_sve.c | 83 ++++++++++++++++++++++++++++++------ 2 files changed, 72 insertions(+), 13 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index dc58e329f..9adacce63 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -74,7 +74,7 @@ DSCALKERNEL = scal.S CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S -SGEMVNKERNEL = gemv_n.S +SGEMVNKERNEL = gemv_n_sve.c DGEMVNKERNEL = gemv_n.S CGEMVNKERNEL = zgemv_n.S ZGEMVNKERNEL = zgemv_n.S diff --git a/kernel/arm64/gemv_n_sve.c b/kernel/arm64/gemv_n_sve.c index 295055561..59a5c8557 100644 --- a/kernel/arm64/gemv_n_sve.c +++ b/kernel/arm64/gemv_n_sve.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2024, The OpenBLAS Project +Copyright (c) 2024-2025, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -59,23 +59,82 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr = a; if (inc_y == 1) { + BLASLONG width = n / 3; uint64_t sve_size = SV_COUNT(); - for (j = 0; j < n; j++) { - SV_TYPE temp_vec = SV_DUP(alpha * x[ix]); - i = 0; - svbool_t pg = SV_WHILE(i, m); - while (svptest_any(SV_TRUE(), pg)) { - SV_TYPE a_vec = svld1(pg, a_ptr + i); + svbool_t pg_true = SV_TRUE(); + svbool_t pg = SV_WHILE(0, m % sve_size); + + FLOAT *a0_ptr = a + lda * width * 0; + FLOAT *a1_ptr = a + lda * width * 1; + FLOAT *a2_ptr = a + lda * width * 2; + + for (j = 0; j < width; j++) { + for (i = 0; (i + sve_size - 1) < m; i += sve_size) { + ix = j * inc_x; + + SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); + SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); + SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); + + SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i); + SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i); + SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i); + + SV_TYPE y_vec = svld1(pg_true, y + i); + y_vec = svmla_lane(y_vec, a00_vec, x0_vec, 0); + y_vec = svmla_lane(y_vec, a01_vec, x1_vec, 0); + y_vec = svmla_lane(y_vec, a02_vec, x2_vec, 0); + + svst1(pg_true, y + i, y_vec); + } + + if (i < m) { + SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); + SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); + SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); + + SV_TYPE a00_vec = svld1(pg, a0_ptr + i); + SV_TYPE a01_vec = svld1(pg, a1_ptr + i); + SV_TYPE a02_vec = svld1(pg, a2_ptr + i); + SV_TYPE y_vec = svld1(pg, y + i); - y_vec = svmla_x(pg, y_vec, temp_vec, a_vec); + y_vec = svmla_m(pg, y_vec, a00_vec, x0_vec); + y_vec = svmla_m(pg, y_vec, a01_vec, x1_vec); + y_vec = svmla_m(pg, y_vec, a02_vec, x2_vec); + + ix += inc_x; + svst1(pg, y + i, y_vec); - i += sve_size; - pg = SV_WHILE(i, m); } + + a0_ptr += lda; + a1_ptr += lda; + a2_ptr += lda; + } + + a_ptr = a2_ptr; + for (j = width * 3; j < n; j++) { + ix = j * inc_x; + for (i = 0; (i + sve_size - 1) < m; i += sve_size) { + SV_TYPE y_vec = svld1(pg_true, y + i); + SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); + 
SV_TYPE a_vec = svld1(pg_true, a_ptr + i); + y_vec = svmla_x(pg_true, y_vec, a_vec, x_vec); + svst1(pg_true, y + i, y_vec); + } + + if (i < m) { + SV_TYPE y_vec = svld1(pg, y + i); + SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); + SV_TYPE a_vec = svld1(pg, a_ptr + i); + y_vec = svmla_m(pg, y_vec, a_vec, x_vec); + svst1(pg, y + i, y_vec); + } + a_ptr += lda; ix += inc_x; } - return(0); + return (0); } for (j = 0; j < n; j++) { @@ -89,4 +148,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ix += inc_x; } return (0); -} +} \ No newline at end of file From 80d3c2ad95781211b77272a1cfc9d77ba7ec402d Mon Sep 17 00:00:00 2001 From: Masato Nakagawa Date: Tue, 11 Mar 2025 20:18:20 +0900 Subject: [PATCH 079/205] Add Improving Load Imbalance in Thread-Parallel GEMM --- driver/level3/level3_thread.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 9b1aadf7d..77aaeee6b 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -591,7 +591,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG BLASLONG nthreads = args -> nthreads; - BLASLONG width, i, j, k, js; + BLASLONG width, width_n, i, j, k, js; BLASLONG m, n, n_from, n_to; int mode; #if defined(DYNAMIC_ARCH) @@ -740,18 +740,25 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG /* Partition (a step of) n into nthreads regions */ range_N[0] = js; num_parts = 0; - while (n > 0){ - width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); - if (width < switch_ratio) { - width = switch_ratio; + for(j = 0; j < nthreads_n; j++){ + width_n = blas_quickdivide(n + nthreads_n - j - 1, nthreads_n - j); + n -= width_n; + for(i = 0; i < nthreads_m; i++){ + width = blas_quickdivide(width_n + nthreads_m - i - 1, nthreads_m - i); + if (width < switch_ratio) { + width = switch_ratio; + } + width = round_up(width_n, width, GEMM_PREFERED_SIZE); + + width_n -= width; + if (width_n < 0) { + width = width + width_n; + width_n = 0; + } + range_N[num_parts + 1] = range_N[num_parts] + width; + + num_parts ++; } - width = round_up(n, width, GEMM_PREFERED_SIZE); - - n -= width; - if (n < 0) width = width + n; - range_N[num_parts + 1] = range_N[num_parts] + width; - - num_parts ++; } for (j = num_parts; j < MAX_CPU_NUMBER; j++) { range_N[j + 1] = range_N[num_parts]; From a085b6c9ec38de7109fe95322db677fc18c31696 Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Wed, 12 Mar 2025 14:49:10 +0000 Subject: [PATCH 080/205] Fix aarch64 sbgemv_t compilation error for GCC < 13 --- CONTRIBUTORS.md | 1 + kernel/arm64/sbgemv_t_bfdot.c | 17 ++++++----------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 041582892..938a3bf91 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -237,6 +237,7 @@ In chronological order: * [2025-01-10] Add thread throttling profile for SGEMM on NEOVERSEV1 * [2025-01-21] Optimize gemv_t_sve_v1x3 kernel * [2025-02-26] Add sbgemv_t_bfdot kernel + * [2025-03-12] Fix aarch64 sbgemv_t compilation error for GCC < 13 * Marek Michalowski * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` diff --git a/kernel/arm64/sbgemv_t_bfdot.c b/kernel/arm64/sbgemv_t_bfdot.c index 0751690fc..fc4ae019e 100644 --- a/kernel/arm64/sbgemv_t_bfdot.c +++ b/kernel/arm64/sbgemv_t_bfdot.c @@ -33,11 +33,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include "common.h" -static inline float bf16_to_fp32(bfloat16 bf16) { - uint32_t fp32 = (uint32_t)bf16 << 16; - return *((float*)&fp32); -} - int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy) { if (m < 1 || n < 1) return(0); @@ -132,10 +127,10 @@ int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat } for (; i < m; ++i) { - y0_ptr[iy] += alpha * a0_ptr[i] * x_ptr[i]; - y1_ptr[iy] += alpha * a1_ptr[i] * x_ptr[i]; - y2_ptr[iy] += alpha * a2_ptr[i] * x_ptr[i]; - y3_ptr[iy] += alpha * a3_ptr[i] * x_ptr[i]; + y0_ptr[iy] += alpha * vcvtah_f32_bf16(a0_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); + y1_ptr[iy] += alpha * vcvtah_f32_bf16(a1_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); + y2_ptr[iy] += alpha * vcvtah_f32_bf16(a2_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); + y3_ptr[iy] += alpha * vcvtah_f32_bf16(a3_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); } iy += incy; @@ -177,7 +172,7 @@ int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat } for (; i < m; ++i) { - y_ptr[iy] += alpha * a_ptr[i] * x_ptr[i]; + y_ptr[iy] += alpha * vcvtah_f32_bf16(a_ptr[i]) * vcvtah_f32_bf16(x_ptr[i]); } iy += incy; @@ -191,7 +186,7 @@ int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat temp = 0.0; ix = 0; for (i = 0; i < m; i++) { - temp += bf16_to_fp32(a[i]) * bf16_to_fp32(x[ix]); + temp += vcvtah_f32_bf16(a_ptr[i]) * vcvtah_f32_bf16(x_ptr[ix]); ix += incx; } if (beta == 0.0f) { From 4c00099ed65af573912065a69d83ce42a9aa0cba Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Wed, 12 Mar 2025 16:20:15 +0000 Subject: [PATCH 081/205] replace customize bf16_to_fp32 with arm neon vcvtah_f32_bf16 --- kernel/arm64/sbgemv_n_neon.c | 86 ++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 48 deletions(-) diff --git a/kernel/arm64/sbgemv_n_neon.c b/kernel/arm64/sbgemv_n_neon.c index 9e7ea03c8..489d4d22c 100644 --- a/kernel/arm64/sbgemv_n_neon.c +++ b/kernel/arm64/sbgemv_n_neon.c @@ -33,16 +33,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if (defined(__GNUC__) && __GNUC__ >= 13) -#define BF16_TO_FP32(bf16) ((float)(bf16)) -#else -static inline float bf16_to_fp32(bfloat16_t bf16) { - uint32_t fp32 = (uint32_t)(*((u_int16_t*)(&bf16))) << 16; - return *((float*)&fp32); -} -#define BF16_TO_FP32(bf16) bf16_to_fp32(bf16) -#endif - static void beta_op(float *x, BLASLONG n, FLOAT beta) { if (beta == 0) { memset(x, 0, n * sizeof(float)); @@ -268,24 +258,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } if (rest_m) { - x0 = alpha * BF16_TO_FP32(x_ptr[0]); - x1 = alpha * BF16_TO_FP32(x_ptr[1]); - x2 = alpha * BF16_TO_FP32(x_ptr[2]); - x3 = alpha * BF16_TO_FP32(x_ptr[3]); - x4 = alpha * BF16_TO_FP32(x_ptr[4]); - x5 = alpha * BF16_TO_FP32(x_ptr[5]); - x6 = alpha * BF16_TO_FP32(x_ptr[6]); - x7 = alpha * BF16_TO_FP32(x_ptr[7]); + x0 = alpha * vcvtah_f32_bf16(x_ptr[0]); + x1 = alpha * vcvtah_f32_bf16(x_ptr[1]); + x2 = alpha * vcvtah_f32_bf16(x_ptr[2]); + x3 = alpha * vcvtah_f32_bf16(x_ptr[3]); + x4 = alpha * vcvtah_f32_bf16(x_ptr[4]); + x5 = alpha * vcvtah_f32_bf16(x_ptr[5]); + x6 = alpha * vcvtah_f32_bf16(x_ptr[6]); + x7 = alpha * vcvtah_f32_bf16(x_ptr[7]); for (BLASLONG j = 0; j < rest_m; j++) { - y_ptr[j] += x0 * BF16_TO_FP32(a_ptr0[j]); - y_ptr[j] += x1 * BF16_TO_FP32(a_ptr1[j]); - y_ptr[j] += x2 * BF16_TO_FP32(a_ptr2[j]); - y_ptr[j] += x3 * BF16_TO_FP32(a_ptr3[j]); - y_ptr[j] += x4 * BF16_TO_FP32(a_ptr4[j]); - y_ptr[j] += x5 * BF16_TO_FP32(a_ptr5[j]); - y_ptr[j] += x6 * BF16_TO_FP32(a_ptr6[j]); - y_ptr[j] += x7 * BF16_TO_FP32(a_ptr7[j]); + y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); + y_ptr[j] += x1 * vcvtah_f32_bf16(a_ptr1[j]); + y_ptr[j] += x2 * vcvtah_f32_bf16(a_ptr2[j]); + y_ptr[j] += x3 * vcvtah_f32_bf16(a_ptr3[j]); + y_ptr[j] += x4 * vcvtah_f32_bf16(a_ptr4[j]); + y_ptr[j] += x5 * vcvtah_f32_bf16(a_ptr5[j]); + y_ptr[j] += x6 * vcvtah_f32_bf16(a_ptr6[j]); + y_ptr[j] += x7 * vcvtah_f32_bf16(a_ptr7[j]); } } @@ -384,16 +374,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } if (rest_m) { - x0 = alpha * BF16_TO_FP32(x_ptr[0]); - x1 = alpha * BF16_TO_FP32(x_ptr[1]); - x2 = alpha * BF16_TO_FP32(x_ptr[2]); - x3 = alpha * BF16_TO_FP32(x_ptr[3]); + x0 = alpha * vcvtah_f32_bf16(x_ptr[0]); + x1 = alpha * vcvtah_f32_bf16(x_ptr[1]); + x2 = alpha * vcvtah_f32_bf16(x_ptr[2]); + x3 = alpha * vcvtah_f32_bf16(x_ptr[3]); for (BLASLONG j = 0; j < rest_m; j++) { - y_ptr[j] += x0 * BF16_TO_FP32(a_ptr0[j]); - y_ptr[j] += x1 * BF16_TO_FP32(a_ptr1[j]); - y_ptr[j] += x2 * BF16_TO_FP32(a_ptr2[j]); - y_ptr[j] += x3 * BF16_TO_FP32(a_ptr3[j]); + y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); + y_ptr[j] += x1 * vcvtah_f32_bf16(a_ptr1[j]); + y_ptr[j] += x2 * vcvtah_f32_bf16(a_ptr2[j]); + y_ptr[j] += x3 * vcvtah_f32_bf16(a_ptr3[j]); } } @@ -480,13 +470,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } if (m & 2) { - x0 = alpha * (BF16_TO_FP32(x_ptr[0])); - x1 = alpha * (BF16_TO_FP32(x_ptr[1])); + x0 = alpha * (vcvtah_f32_bf16(x_ptr[0])); + x1 = alpha * (vcvtah_f32_bf16(x_ptr[1])); - y_ptr[0] += x0 * BF16_TO_FP32(a_ptr0[0]); - y_ptr[0] += x1 * BF16_TO_FP32(a_ptr1[0]); - y_ptr[1] += x0 * BF16_TO_FP32(a_ptr0[1]); - y_ptr[1] += x1 * BF16_TO_FP32(a_ptr1[1]); + y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]); + y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]); + y_ptr[1] += x0 * vcvtah_f32_bf16(a_ptr0[1]); + y_ptr[1] += x1 * vcvtah_f32_bf16(a_ptr1[1]); a_ptr0 += 2; a_ptr1 += 2; @@ -495,23 +485,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 
*a, BLASLONG lda, } if (m & 1) { - x0 = alpha * BF16_TO_FP32(x_ptr[0]); - x1 = alpha * BF16_TO_FP32(x_ptr[1]); + x0 = alpha * vcvtah_f32_bf16(x_ptr[0]); + x1 = alpha * vcvtah_f32_bf16(x_ptr[1]); - y_ptr[0] += x0 * BF16_TO_FP32(a_ptr0[0]); - y_ptr[0] += x1 * BF16_TO_FP32(a_ptr1[0]); + y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]); + y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]); } x_ptr += 2; } if (n & 1) { - x0 = BF16_TO_FP32(x_ptr[0]) * alpha; + x0 = vcvtah_f32_bf16(x_ptr[0]) * alpha; y_ptr = y; a_ptr0 = a_ptr; for (j = 0; j < m; j++) { - y_ptr[j] += x0 * BF16_TO_FP32(a_ptr0[j]); + y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); } } @@ -525,10 +515,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } for (j = 0; j < n; j++) { - x0 = alpha * BF16_TO_FP32(*x_ptr); + x0 = alpha * vcvtah_f32_bf16(*x_ptr); iy = 0; for (i = 0; i < m; i++) { - y[iy] += x0 * BF16_TO_FP32(a_ptr[i]); + y[iy] += x0 * vcvtah_f32_bf16(a_ptr[i]); iy += incy; } From b34235ca66a0116990a28abd3408252fde4a3d7e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Mar 2025 22:41:50 +0100 Subject: [PATCH 082/205] Fix inclusion of deprecated interfaces and cgesvdq/strsyl3 --- cmake/lapacke.cmake | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index f43bf10d0..94224d8ba 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -98,6 +98,8 @@ set(CSRC lapacke_cgesv_work.c lapacke_cgesvd.c lapacke_cgesvd_work.c + lapacke_cgesvdq.c + lapacke_cgesvdq_work.c lapacke_cgesvdx.c lapacke_cgesvdx_work.c lapacke_cgesvj.c @@ -1766,8 +1768,8 @@ set(SSRC lapacke_strsna_work.c lapacke_strsyl.c lapacke_strsyl_work.c - lapacke_ctrsyl3.c - lapacke_ctrsyl3_work.c + lapacke_strsyl3.c + lapacke_strsyl3_work.c lapacke_strtri.c lapacke_strtri_work.c lapacke_strtrs.c @@ -2410,10 +2412,10 @@ set(ZSRC lapacke_ilaver.c ) if (BUILD_LAPACK_DEPRECATED) -set(SRCS $SRCS lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c) -set(SRCD $SRCD lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c) -set(SRCC $SRCC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c) -set(SRCZ $SRCZ lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c) + list(APPEND SSRC lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c) + list(APPEND DSRC lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c) + list(APPEND CSRC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c) + list(APPEND ZSRC lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c) endif() set(SRCX From 8a418b1aab753c4101d7b5129398417c2ee18c87 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Mar 2025 23:20:16 +0100 Subject: [PATCH 083/205] Add dummy implementations for the LAPACK_COMPLEX_CUSTOM case --- lapack-netlib/LAPACKE/utils/lapacke_make_complex_double.c | 2 ++ lapack-netlib/LAPACKE/utils/lapacke_make_complex_float.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/lapack-netlib/LAPACKE/utils/lapacke_make_complex_double.c b/lapack-netlib/LAPACKE/utils/lapacke_make_complex_double.c index 
f6fb74b18..274ece33c 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_make_complex_double.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_make_complex_double.c @@ -48,4 +48,6 @@ lapack_complex_double lapack_make_complex_double( double re, double im ) { #endif return z; } +#else +lapack_complex_double lapack_make_complex_double( double re, double im ) {} #endif diff --git a/lapack-netlib/LAPACKE/utils/lapacke_make_complex_float.c b/lapack-netlib/LAPACKE/utils/lapacke_make_complex_float.c index c04eb084c..c76e59c91 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_make_complex_float.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_make_complex_float.c @@ -48,4 +48,6 @@ lapack_complex_float lapack_make_complex_float( float re, float im ) { #endif return z; } +#else +lapack_complex_float lapack_make_complex_float( float re, float im ) {} #endif From 9807f56580fca7f06dd9ef1e14673748cf025e31 Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Wed, 12 Mar 2025 21:26:27 +0000 Subject: [PATCH 084/205] Optimize aarch64 sgemm_ncopy --- CONTRIBUTORS.md | 1 + kernel/arm64/sgemm_ncopy_4.S | 47 +++++------ kernel/arm64/sgemm_ncopy_8.S | 149 ++++++++++++++--------------------- 3 files changed, 78 insertions(+), 119 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 938a3bf91..2e2979acc 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -238,6 +238,7 @@ In chronological order: * [2025-01-21] Optimize gemv_t_sve_v1x3 kernel * [2025-02-26] Add sbgemv_t_bfdot kernel * [2025-03-12] Fix aarch64 sbgemv_t compilation error for GCC < 13 + * [2025-03-12] Optimize aarch64 sgemm_ncopy * Marek Michalowski * [2025-01-21] Add thread throttling profile for SGEMV on `NEOVERSEV1` diff --git a/kernel/arm64/sgemm_ncopy_4.S b/kernel/arm64/sgemm_ncopy_4.S index c819ee6fb..de8c8eca6 100644 --- a/kernel/arm64/sgemm_ncopy_4.S +++ b/kernel/arm64/sgemm_ncopy_4.S @@ -88,28 +88,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prfm PLDL1KEEP, [A04, #A_PREFETCH] ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - ldr q2, [A03], #16 - ins v8.s[2], v2.s[0] - ins v9.s[2], v2.s[1] - ins v10.s[2], v2.s[2] - ins v11.s[2], v2.s[3] - ldr q3, [A04], #16 - ins v8.s[3], v3.s[0] - ins v9.s[3], v3.s[1] - ins v10.s[3], v3.s[2] - ins v11.s[3], v3.s[3] + + zip1 v12.4s, v0.4s, v1.4s + zip1 v13.4s, v2.4s, v3.4s + zip2 v14.4s, v0.4s, v1.4s + zip2 v15.4s, v2.4s, v3.4s + + zip1 v8.2d, v12.2d, v13.2d + zip2 v9.2d, v12.2d, v13.2d + zip1 v10.2d, v14.2d, v15.2d + zip2 v11.2d, v14.2d, v15.2d st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] add B00, B00, #64 @@ -138,16 +129,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prfm PLDL1KEEP, [A02, #A_PREFETCH] ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] + + zip1 v12.4s, v0.4s, v1.4s + zip2 v13.4s, v0.4s, v1.4s + + dup v8.2d, v12.d[0] + dup v9.2d, v12.d[1] + dup v10.2d, v13.d[0] + dup v11.2d , v13.d[1] st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] add B00, B00, #32 @@ -330,4 +320,3 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ret EPILOGUE - diff --git a/kernel/arm64/sgemm_ncopy_8.S b/kernel/arm64/sgemm_ncopy_8.S index f99b1d992..d941eb3eb 100644 --- a/kernel/arm64/sgemm_ncopy_8.S +++ b/kernel/arm64/sgemm_ncopy_8.S @@ -86,47 +86,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x8 ldr q0, [A01], #16 ldr q1, [A02], #16 - ins v8.s[0], v0.s[0] - ins v10.s[0], v0.s[1] - ins v12.s[0], v0.s[2] - ins v14.s[0], v0.s[3] - ins v8.s[1], v1.s[0] - ins v10.s[1], v1.s[1] - ins v12.s[1], v1.s[2] - ins v14.s[1], v1.s[3] - ldr q2, [A03], #16 ldr q3, [A04], #16 - ins v8.s[2], v2.s[0] - ins v10.s[2], v2.s[1] - ins v12.s[2], v2.s[2] - ins v14.s[2], v2.s[3] - ins v8.s[3], v3.s[0] - ins v10.s[3], v3.s[1] - ins v12.s[3], v3.s[2] - ins v14.s[3], v3.s[3] + + zip1 v16.4s, v0.4s, v1.4s + zip1 v17.4s, v2.4s, v3.4s + zip2 v18.4s, v0.4s, v1.4s + zip2 v19.4s, v2.4s, v3.4s + + zip1 v8.2d, v16.2d, v17.2d + zip2 v10.2d, v16.2d, v17.2d + zip1 v12.2d, v18.2d, v19.2d + zip2 v14.2d, v18.2d, v19.2d ldr q4, [A05], #16 ldr q5, [A06], #16 - ins v9.s[0], v4.s[0] - ins v11.s[0], v4.s[1] - ins v13.s[0], v4.s[2] - ins v15.s[0], v4.s[3] - ins v9.s[1], v5.s[0] - ins v11.s[1], v5.s[1] - ins v13.s[1], v5.s[2] - ins v15.s[1], v5.s[3] - ldr q6, [A07], #16 ldr q7, [A08], #16 - ins v9.s[2], v6.s[0] - ins v11.s[2], v6.s[1] - ins v13.s[2], v6.s[2] - ins v15.s[2], v6.s[3] - ins v9.s[3], v7.s[0] - ins v11.s[3], v7.s[1] - ins v13.s[3], v7.s[2] - ins v15.s[3], v7.s[3] + + zip1 v16.4s, v4.4s, v5.4s + zip1 v17.4s, v6.4s, v7.4s + zip2 v18.4s, v4.4s, v5.4s + zip2 v19.4s, v6.4s, v7.4s + + zip1 v9.2d, v16.2d, v17.2d + zip2 v11.2d, v16.2d, v17.2d + zip1 v13.2d, v18.2d, v19.2d + zip2 v15.2d, v18.2d, v19.2d st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B00], #64 @@ -135,31 +121,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x8 ldr d0, [A01], #8 ldr d1, [A02], #8 - ins v8.s[0], v0.s[0] - ins v10.s[0], v0.s[1] - ins v8.s[1], v1.s[0] - ins v10.s[1], v1.s[1] - ldr d2, [A03], #8 ldr d3, [A04], #8 - ins v8.s[2], v2.s[0] - ins v10.s[2], v2.s[1] - ins v8.s[3], v3.s[0] - ins v10.s[3], v3.s[1] + + zip1 v12.4s, v0.4s, v1.4s + zip1 v13.4s, v2.4s, v3.4s + + zip1 v8.2d, v12.2d, v13.2d + zip2 v10.2d, v12.2d, v13.2d ldr d4, [A05], #8 ldr d5, [A06], #8 - ins v9.s[0], v4.s[0] - ins v11.s[0], v4.s[1] - ins v9.s[1], v5.s[0] - ins v11.s[1], v5.s[1] - ldr d6, [A07], #8 ldr d7, [A08], #8 - ins v9.s[2], v6.s[0] - ins v11.s[2], v6.s[1] - ins v9.s[3], v7.s[0] - ins v11.s[3], v7.s[1] + + zip1 v12.4s, v4.4s, v5.4s + zip1 v13.4s, v6.4s, v7.4s + + zip1 v9.2d, v12.2d, v13.2d + zip2 v11.2d, v12.2d, v13.2d st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 .endm @@ -191,25 +171,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY4x4 ldr q0, [A01], #16 ldr q1, [A02], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - ldr q2, [A03], #16 ldr q3, [A04], #16 - ins v8.s[2], v2.s[0] - ins v9.s[2], v2.s[1] - ins v10.s[2], v2.s[2] - ins v11.s[2], v2.s[3] - ins v8.s[3], v3.s[0] - ins v9.s[3], v3.s[1] - ins v10.s[3], v3.s[2] - ins v11.s[3], v3.s[3] + + zip1 v12.4s, v0.4s, v1.4s + zip1 v13.4s, v2.4s, v3.4s + zip2 v14.4s, v0.4s, v1.4s + zip2 v15.4s, v2.4s, v3.4s + + zip1 v8.2d, v12.2d, v13.2d + zip2 v9.2d, v12.2d, v13.2d + zip1 v10.2d, v14.2d, v15.2d + zip2 v11.2d, v14.2d, v15.2d st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 .endm @@ -217,17 +190,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY2x4 ldr d0, [A01], #8 ldr d1, [A02], #8 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ldr d2, [A03], #8 ldr d3, [A04], #8 - ins v8.s[2], v2.s[0] - ins v9.s[2], v2.s[1] - ins v8.s[3], v3.s[0] - ins v9.s[3], v3.s[1] + + zip1 v10.4s, v0.4s, v1.4s + zip1 v11.4s, v2.4s, v3.4s + + zip1 v8.2d, v10.2d, v11.2d + zip2 v9.2d, v10.2d, v11.2d st1 {v8.4s, v9.4s}, [B00], #32 .endm @@ -249,14 +219,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY4x2 ldr q0, [A01], #16 ldr q1, [A02], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] + + zip1 v12.4s, v0.4s, v1.4s + zip2 v13.4s, v0.4s, v1.4s + + dup v8.2d, v12.d[0] + dup v9.2d, v12.d[1] + dup v10.2d, v13.d[0] + dup v11.2d , v13.d[1] st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32 .endm @@ -264,10 +234,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro COPY2x2 ldr d0, [A01], #8 ldr d1, [A02], #8 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] + + zip1 v8.2s, v0.2s, v1.2s + zip2 v9.2s, v0.2s, v1.2s st1 {v8.2s, v9.2s}, [B00], #16 .endm From edef2e4441e50e3a2da1920fdbde09101087c43d Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Thu, 13 Mar 2025 20:55:31 +0000 Subject: [PATCH 085/205] Fix bug in ARM64 sbgemv_t --- kernel/arm64/sbgemv_t_bfdot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/sbgemv_t_bfdot.c b/kernel/arm64/sbgemv_t_bfdot.c index fc4ae019e..672f70acf 100644 --- a/kernel/arm64/sbgemv_t_bfdot.c +++ b/kernel/arm64/sbgemv_t_bfdot.c @@ -196,7 +196,7 @@ int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat y[iy] = alpha * temp + beta * y[iy]; } iy += incy; - a += lda; + a_ptr += lda; } return (0); } From f27ba5efd15fd7ab94543b551e60325edca728d7 Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Fri, 14 Mar 2025 17:55:40 +0000 Subject: [PATCH 086/205] fix bugs in aarch64 sbgemv_n kernel --- kernel/arm64/sbgemv_n_neon.c | 83 +++++++++++++++--------------------- 1 file changed, 34 insertions(+), 49 deletions(-) diff --git a/kernel/arm64/sbgemv_n_neon.c b/kernel/arm64/sbgemv_n_neon.c index 489d4d22c..ff730407f 100644 --- a/kernel/arm64/sbgemv_n_neon.c +++ b/kernel/arm64/sbgemv_n_neon.c @@ -69,12 +69,8 @@ static void beta_op(float *x, BLASLONG n, FLOAT beta) { x += 4; } - if (rest_n & 3) { - x[0] *= beta; - if ((rest_n & 3) > 1) - x[1] *= beta; - if ((rest_n & 3) > 2) - x[2] *= beta; + for (BLASLONG i = 0; i < (rest_n & 3); i ++) { + x[i] *= beta; } } return; @@ -88,7 +84,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, bfloat16x8_t a0, a1, a2, a3, a4, a5, a6, a7; bfloat16x8_t t0, t1, t2, t3, t4, t5, t6, t7; + bfloat16x8_t x_vec; + bfloat16x4_t x_vecx4; + float32x4_t y1_vec, y2_vec; float32x4_t fp32_low, fp32_high; @@ -106,7 +105,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, if (incx == 1 && incy == 1) { if (beta != 1) { - beta_op(y, n, beta); + beta_op(y, m, beta); } for (i = 0; i < n / 8; i++) { @@ -290,12 +289,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, a_ptr += 4 * lda; - bfloat16x4_t x_vecx4 = vld1_bf16(x_ptr); + x_vecx4 = vld1_bf16(x_ptr); if (alpha != 1) { - x_vec = vcombine_bf16(x_vecx4, bf16_zero); - fp32_low = vreinterpretq_f32_u16( - vzip1q_u16(vreinterpretq_u16_bf16(bf16_zero_q), - vreinterpretq_u16_bf16(x_vec))); + fp32_low = vcvt_f32_bf16(x_vecx4); fp32_low = vmulq_n_f32(fp32_low, alpha); x_vecx4 = vcvt_bf16_f32(fp32_low); } @@ -348,15 +344,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, y1_vec = vld1q_f32(y_ptr); - a0 = vcombine_bf16(a0x4, bf16_zero); - a1 = vcombine_bf16(a1x4, bf16_zero); - a2 = vcombine_bf16(a2x4, bf16_zero); - a3 = vcombine_bf16(a3x4, bf16_zero); + a0 = vcombine_bf16(a0x4, a2x4); + a1 = vcombine_bf16(a1x4, a3x4); - t0 = vreinterpretq_bf16_u16( - vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); - t1 = vreinterpretq_bf16_u16( - vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + t0 = vreinterpretq_bf16_u16(vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); + t1 = vreinterpretq_bf16_u16(vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); @@ -374,10 +366,12 @@ int CNAME(BLASLONG m, BLASLONG 
n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } if (rest_m) { - x0 = alpha * vcvtah_f32_bf16(x_ptr[0]); - x1 = alpha * vcvtah_f32_bf16(x_ptr[1]); - x2 = alpha * vcvtah_f32_bf16(x_ptr[2]); - x3 = alpha * vcvtah_f32_bf16(x_ptr[3]); + fp32_low = vcvt_f32_bf16(x_vecx4); + + x0 = vgetq_lane_f32(fp32_low, 0); + x1 = vgetq_lane_f32(fp32_low, 1); + x2 = vgetq_lane_f32(fp32_low, 2); + x3 = vgetq_lane_f32(fp32_low, 3); for (BLASLONG j = 0; j < rest_m; j++) { y_ptr[j] += x0 * vcvtah_f32_bf16(a_ptr0[j]); @@ -396,18 +390,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, a_ptr += 2 * lda; - bfloat16_t tmp_buffer[4]; - memset((void*)tmp_buffer, 0, sizeof(bfloat16_t)); - - tmp_buffer[0] = x_ptr[0]; - tmp_buffer[1] = x_ptr[1]; + x_vecx4 = vreinterpret_bf16_u16(vzip1_u16( + vreinterpret_u16_bf16(vdup_n_bf16(x_ptr[0])), + vreinterpret_u16_bf16(vdup_n_bf16(x_ptr[1])) + )); - bfloat16x4_t x_vecx4 = vld1_bf16(tmp_buffer); if (alpha != 1) { - x_vec = vcombine_bf16(x_vecx4, bf16_zero); - fp32_low = vreinterpretq_f32_u16( - vzip1q_u16(vreinterpretq_u16_bf16(bf16_zero_q), - vreinterpretq_u16_bf16(x_vec))); + fp32_low = vcvt_f32_bf16(x_vecx4); fp32_low = vmulq_n_f32(fp32_low, alpha); x_vecx4 = vcvt_bf16_f32(fp32_low); } @@ -422,14 +411,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, t0 = vreinterpretq_bf16_u16( vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); - t4 = vreinterpretq_bf16_u16( + t1 = vreinterpretq_bf16_u16( vzip2q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); - y2_vec = vbfmlalbq_lane_f32(y2_vec, t4, x_vecx4, 0); - y2_vec = vbfmlaltq_lane_f32(y2_vec, t4, x_vecx4, 1); + y2_vec = vbfmlalbq_lane_f32(y2_vec, t1, x_vecx4, 0); + y2_vec = vbfmlaltq_lane_f32(y2_vec, t1, x_vecx4, 1); vst1q_f32(y_ptr, y1_vec); vst1q_f32(y_ptr + 4, y2_vec); @@ -449,29 +438,24 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, a0 = vcombine_bf16(a0x4, bf16_zero); a1 = vcombine_bf16(a1x4, bf16_zero); - t0 = vreinterpretq_bf16_u16( - vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); - t1 = vreinterpretq_bf16_u16( - vzip1q_u16(vreinterpretq_u16_bf16(a2), vreinterpretq_u16_bf16(a3))); + t0 = vreinterpretq_bf16_u16(vzip1q_u16(vreinterpretq_u16_bf16(a0), vreinterpretq_u16_bf16(a1))); y1_vec = vbfmlalbq_lane_f32(y1_vec, t0, x_vecx4, 0); y1_vec = vbfmlaltq_lane_f32(y1_vec, t0, x_vecx4, 1); - y1_vec = vbfmlalbq_lane_f32(y1_vec, t1, x_vecx4, 2); - y1_vec = vbfmlaltq_lane_f32(y1_vec, t1, x_vecx4, 3); vst1q_f32(y_ptr, y1_vec); a_ptr0 += 4; a_ptr1 += 4; - a_ptr2 += 4; - a_ptr3 += 4; y_ptr += 4; } if (m & 2) { - x0 = alpha * (vcvtah_f32_bf16(x_ptr[0])); - x1 = alpha * (vcvtah_f32_bf16(x_ptr[1])); + fp32_low = vcvt_f32_bf16(x_vecx4); + x0 = vgetq_lane_f32(fp32_low, 0); + x1 = vgetq_lane_f32(fp32_low, 1); + y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]); y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]); @@ -485,8 +469,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, bfloat16 *a, BLASLONG lda, } if (m & 1) { - x0 = alpha * vcvtah_f32_bf16(x_ptr[0]); - x1 = alpha * vcvtah_f32_bf16(x_ptr[1]); + fp32_low = vcvt_f32_bf16(x_vecx4); + x0 = vgetq_lane_f32(fp32_low, 0); + x1 = vgetq_lane_f32(fp32_low, 1); y_ptr[0] += x0 * vcvtah_f32_bf16(a_ptr0[0]); y_ptr[0] += x1 * vcvtah_f32_bf16(a_ptr1[0]); From b6cb5ece5845512c1598aaca03831f8e6f63756a Mon Sep 17 00:00:00 2001 From: "shubham.chaudhari" Date: Fri, 28 Feb 2025 13:10:40 
+0530 Subject: [PATCH 087/205] Add thread throttling profile for DGEMV on NEOVERSEV1 --- interface/gemv.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/interface/gemv.c b/interface/gemv.c index d03133946..360b82dcd 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -89,6 +89,24 @@ static inline int get_gemv_optimal_nthreads_neoversev2(BLASLONG MN, int ncpu) { } #endif +//thread throttling for dgemv +#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) +static inline int get_dgemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { + + return + MN < 8100L ? 1 +: MN < 12100L ? MIN(ncpu, 2) +: MN < 36100L ? MIN(ncpu, 4) +: MN < 84100L ? MIN(ncpu, 8) +: MN < 348100L ? MIN(ncpu, 16) +: MN < 435600L ? MIN(ncpu, 24) +: MN < 810000L ? MIN(ncpu, 32) +: MN < 1050625 ? MIN(ncpu, 40) +: ncpu; + +} +#endif + static inline int get_gemv_optimal_nthreads(BLASLONG MN) { int ncpu = num_cpu_avail(3); #if defined(_WIN64) && defined(_M_ARM64) @@ -98,6 +116,8 @@ static inline int get_gemv_optimal_nthreads(BLASLONG MN) { #endif #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); +#elif defined(NEOVERSEV1) && !defined(COMPLEX) && defined(DOUBLE) && !defined(BFLOAT16) + return get_dgemv_optimal_nthreads_neoversev1(MN, ncpu); #elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) From 189dbbc04ff6fb4d58168fd1aef11c21ed9d14c4 Mon Sep 17 00:00:00 2001 From: "shubham.chaudhari" Date: Tue, 4 Mar 2025 16:08:55 +0530 Subject: [PATCH 088/205] Add thread throttling for dynamic arch neoversev1 --- interface/gemv.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/interface/gemv.c b/interface/gemv.c index 360b82dcd..22409649e 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -127,6 +127,12 @@ static inline int get_gemv_optimal_nthreads(BLASLONG MN) { if (strcmp(gotoblas_corename(), "neoversev2") == 0) { return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); } +#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && defined(DOUBLE) && !defined(BFLOAT16) + if (strcmp(gotoblas_corename(), "neoversev1") == 0) { + return get_dgemv_optimal_nthreads_neoversev1(MN, ncpu); + } + + #endif if ( MN < 115200L * GEMM_MULTITHREAD_THRESHOLD ) From 8e289ecddc7f51913d3fafcb11957d780c7d3d7e Mon Sep 17 00:00:00 2001 From: "shubham.chaudhari" Date: Tue, 18 Mar 2025 13:24:05 +0530 Subject: [PATCH 089/205] Simplified thread throttling function in gemv --- interface/gemv.c | 51 +++++++++++++++++------------------------------- 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/interface/gemv.c b/interface/gemv.c index 22409649e..34b6addd3 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -70,11 +70,22 @@ static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) static inline int get_gemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { - return - MN < 25600L ? 1 - : MN < 63001L ? MIN(ncpu, 4) - : MN < 459684L ? MIN(ncpu, 16) - : ncpu; + #ifdef DOUBLE + return (MN < 8100L) ? 1 + : (MN < 12100L) ? MIN(ncpu, 2) + : (MN < 36100L) ? MIN(ncpu, 4) + : (MN < 84100L) ? MIN(ncpu, 8) + : (MN < 348100L) ? MIN(ncpu, 16) + : (MN < 435600L) ? MIN(ncpu, 24) + : (MN < 810000L) ? MIN(ncpu, 32) + : (MN < 1050625L) ? MIN(ncpu, 40) + : ncpu; + #else + return (MN < 25600L) ? 
1 + : (MN < 63001L) ? MIN(ncpu, 4) + : (MN < 459684L) ? MIN(ncpu, 16) + : ncpu; + #endif } #endif @@ -89,24 +100,6 @@ static inline int get_gemv_optimal_nthreads_neoversev2(BLASLONG MN, int ncpu) { } #endif -//thread throttling for dgemv -#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) -static inline int get_dgemv_optimal_nthreads_neoversev1(BLASLONG MN, int ncpu) { - - return - MN < 8100L ? 1 -: MN < 12100L ? MIN(ncpu, 2) -: MN < 36100L ? MIN(ncpu, 4) -: MN < 84100L ? MIN(ncpu, 8) -: MN < 348100L ? MIN(ncpu, 16) -: MN < 435600L ? MIN(ncpu, 24) -: MN < 810000L ? MIN(ncpu, 32) -: MN < 1050625 ? MIN(ncpu, 40) -: ncpu; - -} -#endif - static inline int get_gemv_optimal_nthreads(BLASLONG MN) { int ncpu = num_cpu_avail(3); #if defined(_WIN64) && defined(_M_ARM64) @@ -114,25 +107,17 @@ static inline int get_gemv_optimal_nthreads(BLASLONG MN) { return num_cpu_avail(4); return 1; #endif -#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) +#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16) return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); -#elif defined(NEOVERSEV1) && !defined(COMPLEX) && defined(DOUBLE) && !defined(BFLOAT16) - return get_dgemv_optimal_nthreads_neoversev1(MN, ncpu); #elif defined(NEOVERSEV2) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); -#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) +#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16) if (strcmp(gotoblas_corename(), "neoversev1") == 0) { return get_gemv_optimal_nthreads_neoversev1(MN, ncpu); } if (strcmp(gotoblas_corename(), "neoversev2") == 0) { return get_gemv_optimal_nthreads_neoversev2(MN, ncpu); } -#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && defined(DOUBLE) && !defined(BFLOAT16) - if (strcmp(gotoblas_corename(), "neoversev1") == 0) { - return get_dgemv_optimal_nthreads_neoversev1(MN, ncpu); - } - - #endif if ( MN < 115200L * GEMM_MULTITHREAD_THRESHOLD ) From c0a5c9655ed3e8f7c3903ccb33ea96c6ae9b80ad Mon Sep 17 00:00:00 2001 From: Harishmcw Date: Mon, 24 Mar 2025 13:49:55 +0530 Subject: [PATCH 090/205] Fix missing commas in gensymbol.pl --- exports/gensymbol.pl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/exports/gensymbol.pl b/exports/gensymbol.pl index dd79e924d..91892395e 100644 --- a/exports/gensymbol.pl +++ b/exports/gensymbol.pl @@ -59,7 +59,7 @@ cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, cblas_cgeadd, cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, cblas_scnrm2, cblas_scasum, - cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy + cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy, cblas_cgemmt); @cblasobjsd = ( cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, @@ -68,7 +68,7 @@ cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2, cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, - cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy + cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy, cblas_dgemmt); @cblasobjss = ( @@ -79,7 +79,7 @@ cblas_sswap, cblas_ssymm, 
cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, cblas_strsv, cblas_sgeadd, - cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy + cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy, cblas_sgemmt); @cblasobjsz = ( cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, @@ -89,7 +89,7 @@ cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, cblas_zaxpby, cblas_zgeadd, - cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy + cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy, cblas_zgemmt); @cblasobjs = ( cblas_xerbla ); @@ -1584,7 +1584,7 @@ zpotri, LAPACKE_cgetsqrhrt, LAPACKE_cgetsqrhrt_work, LAPACKE_cungtsqr_row, - LAPACKE_cungtsqr_row_work + LAPACKE_cungtsqr_row_work, ); @lapackeobjsd = ( @@ -2197,7 +2197,7 @@ zpotri, LAPACKE_dgetsqrhrt, LAPACKE_dgetsqrhrt_work, LAPACKE_dorgtsqr_row, - LAPACKE_dorgtsqr_row_work + LAPACKE_dorgtsqr_row_work, ); @lapackeobjss = ( @@ -2802,7 +2802,7 @@ zpotri, LAPACKE_sgetsqrhrt, LAPACKE_sgetsqrhrt_work, LAPACKE_sorgtsqr_row, - LAPACKE_sorgtsqr_row_work + LAPACKE_sorgtsqr_row_work, ); @lapackeobjsz = ( @@ -3345,7 +3345,7 @@ zpotri, LAPACKE_zgetsqrhrt, LAPACKE_zgetsqrhrt_work, LAPACKE_zungtsqr_row, - LAPACKE_zungtsqr_row_work + LAPACKE_zungtsqr_row_work, ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the @@ -3551,7 +3551,7 @@ zpotri, LAPACKE_zsytrs_aa_2stage_work, # new functions from 3.9.0 LAPACKE_zgesvdq, - LAPACKE_zgesvdq_work + LAPACKE_zgesvdq_work, ); #These function may need 2 underscores. @@ -3639,7 +3639,7 @@ zpotri, zhesv_aa_2stage, zhetrf_aa_2stage, zhetrs_aa_2stage, zsysv_aa_2stage, zsytrf_aa_2stage, zsytrs_aa_2stage, - zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col + zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col, ); From c2e7ab5351cd402edfed96225c88ca5d0d753780 Mon Sep 17 00:00:00 2001 From: Harishmcw Date: Wed, 26 Mar 2025 10:50:29 +0530 Subject: [PATCH 091/205] DLL symbol pre/postfixing in CMake builds --- CMakeLists.txt | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f8b63041a..c140bf5ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,6 +77,16 @@ set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. 
_64 for INTERFACE64 builds" ) +if (CMAKE_SYSTEM_NAME MATCHES "Windows" AND BUILD_SHARED_LIBS AND NOT ("${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL "")) +if (NOT BUILD_STATIC_LIBS) + message (STATUS "forcing build of a temporary static library for symbol renaming") + set (BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared library" FORCE) + set (BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) + set (DELETE_STATIC_LIBS 1) +endif () +endif() + + ####### if(BUILD_WITHOUT_LAPACK) set(NO_LAPACK 1) @@ -379,7 +389,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) endif() endif() -if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") +if (BUILD_SHARED_LIBS OR DELETE_STATIC_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") if (NOT DEFINED ARCH) set(ARCH_IN "x86_64") else() @@ -467,10 +477,26 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") else () set (BZ 0) endif() + + if (CMAKE_SYSTEM_NAME MATCHES "Windows") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) + #if (USE_PERL) +message(STATUS "adding postbuild instruction to rename syms") + add_custom_command(TARGET ${OpenBLAS_LIBNAME}_static POST_BUILD + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "win2k" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/renamesyms.def + COMMAND ${CMAKE_C_COMPILER} ${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR} -c -o ${PROJECT_BINARY_DIR}/dllinit.o ${PROJECT_SOURCE_DIR}/exports/dllinit.c + COMMAND lld-link -nodefaultlib:libcmt -defaultlib:msvcrt ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $ -wholearchive:$ -dll -out:$/${OpenBLAS_LIBNAME}.dll -implib:$/${OpenBLAS_LIBNAME}.dll.a + #if (${REMOVE_STATIC_LIB}) + #file (REMOVE ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.lib) + #endif () + ) + #endif () + else () if (NOT USE_PERL) add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD - COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def - COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so + COMMAND sh ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so COMMENT "renaming symbols" ) else() @@ -481,6 +507,7 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") ) endif() endif() +endif() if (BUILD_BENCHMARKS) #find_package(OpenMP REQUIRED) @@ -650,4 +677,4 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake DESTINATION ${CMAKECONFIG_INSTALL_DIR}) install(EXPORT "${PN}${SUFFIX64}Targets" NAMESPACE "${PN}${SUFFIX64}::" - DESTINATION 
${CMAKECONFIG_INSTALL_DIR}) + DESTINATION ${CMAKECONFIG_INSTALL_DIR}) \ No newline at end of file From 1724b3f10497bc4c4c62e9019e6bd169796d0cda Mon Sep 17 00:00:00 2001 From: Harishmcw Date: Wed, 26 Mar 2025 10:55:50 +0530 Subject: [PATCH 092/205] DLL symbol pre/postfixing in CMake builds --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c140bf5ba..df9c631d4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -488,7 +488,7 @@ message(STATUS "adding postbuild instruction to rename syms") COMMAND ${CMAKE_C_COMPILER} ${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR} -c -o ${PROJECT_BINARY_DIR}/dllinit.o ${PROJECT_SOURCE_DIR}/exports/dllinit.c COMMAND lld-link -nodefaultlib:libcmt -defaultlib:msvcrt ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $ -wholearchive:$ -dll -out:$/${OpenBLAS_LIBNAME}.dll -implib:$/${OpenBLAS_LIBNAME}.dll.a #if (${REMOVE_STATIC_LIB}) - #file (REMOVE ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.lib) + #file (REMOVE $/${OpenBLAS_LIBNAME}.lib) #endif () ) #endif () From 3ca1ba1be3865169803d6bb2c10bacad5196e7ad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 Mar 2025 18:37:11 +0100 Subject: [PATCH 093/205] resynchronize with the posix shell version --- exports/gensymbol.pl | 165 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 145 insertions(+), 20 deletions(-) diff --git a/exports/gensymbol.pl b/exports/gensymbol.pl index 91892395e..559730634 100644 --- a/exports/gensymbol.pl +++ b/exports/gensymbol.pl @@ -21,7 +21,7 @@ chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax, chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2, csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, - ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt); + ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt,cgemmtr); @blasobjsd = ( damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm, @@ -29,7 +29,7 @@ dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy, dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv, - idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt); + idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt,dgemmtr); @blasobjss = ( isamax,isamin,ismax,ismin, @@ -38,7 +38,7 @@ smax,smin,snrm2,simatcopy,somatcopy, srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, - strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt); + strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt,sgemmtr); @blasobjsz = ( izamax,izamin,, @@ -48,28 +48,29 @@ zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, - zgeadd, dzsum, zgemmt); + zgeadd, dzsum, zgemmt,zgemmtr); @blasobjs = (lsame, xerbla); -@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); +@bfblasobjs = (sbgemm, sbgemmt, sbgemmtr, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @cblasobjsc = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, cblas_caxpby, cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, cblas_cgeadd, cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, 
cblas_ctrsv, - cblas_scnrm2, cblas_scasum, + cblas_scnrm2, cblas_scasum, cblas_cgemmt, cblas_cgemmtr, cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy, - cblas_cgemmt); + cblas_caxpyc, cblas_crotg, cblas_csrot, cblas_scamax, cblas_scamin, cblas_cgemm_batch); + @cblasobjsd = ( cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, cblas_drot, cblas_drotg, cblas_drotm, cblas_drotmg, cblas_dsbmv, cblas_dscal, cblas_dsdot, cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2, cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, - cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, + cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, cblas_dgemmt, cblas_dgemmtr, cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy, - cblas_dgemmt); + cblas_damax, cblas_damin, cblas_dgemm_batch); @cblasobjss = ( cblas_sasum, cblas_saxpy, cblas_saxpby, @@ -78,9 +79,10 @@ cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, - cblas_strsv, cblas_sgeadd, + cblas_strsv, cblas_sgeadd, cblas_sgemmt, cblas_sgemmtr, cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy, - cblas_sgemmt); + cblas_samax, cblas_samin, cblas_sgemm_batch); + @cblasobjsz = ( cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, @@ -88,13 +90,13 @@ cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, - cblas_zaxpby, cblas_zgeadd, + cblas_zaxpby, cblas_zgeadd, cblas_zgemmt, cblas_zgemmtr, cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy, - cblas_zgemmt); + cblas_zaxpyc, cblas_zdrot, cblas_zrotg, cblas_dzamax, cblas_dzamin, cblas_zgemm_batch); @cblasobjs = ( cblas_xerbla ); -@bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); +@bfcblasobjs = (cblas_sbgemm, cblas_sbgemmt, cblas_sbgemmtr, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod, cblas_sbgemm_batch); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, @@ -709,6 +711,7 @@ zpotri, # functions added for lapack-3.7.0 @lapackobjs2s = (@lapackobjs2s, slarfy, + ssyconvf, strevc3, sgelqt, sgelqt3, @@ -832,12 +835,82 @@ zpotri, zungtsqr_row ); +#functions added for lapack-3.11 +@lapackobjs2c = (@lapackobjs2c, + cgedmd, + cgedmdq + ); +@lapackobjs2d = (@lapackobjs2d, + dgedmd, + dgedmdq + ); +@lapackobjs2s = (@lapackobjs2s, + sgedmd, + sgedmdq + ); +@lapackobjs2z = (@lapackobjs2z, + zgedmd, + zgedmdq + ); + +#functions added post 3.11 + +@lapackobjs2c = (@lapackobjs2c, + cgelst, + cgeqp3rk, + claqp2rk, + claqp3rk, + clatrs3, + crscl, + ctrsyl3 + ); +# claqz0 +# claqz1 +# claqz2 +# claqz3 +# clatrs3 + +@lapackobjs2d = (@lapackobjs2d, + dgelst, + dgeqp3rk, + 
dlaqp2rk, + dlaqp3rk, + dlarmm, + dlatrs3, + dtrsyl3 + ); + +@lapackobjs2s = (@lapackobjs2s, + sgelst, + sgeqp3rk, + slaqp2rk, + slaqp3rk, + slarmm, + slatrs3, + strsyl3 + ); + +@lapackobjs2z = (@lapackobjs2z, + zgelst, + zgeqp3rk, + zlaqp2rk, + zlaqp3rk, + zlatrs3, + zrscl, + ztrsyl3 + ); +# zlaqz0 +# zlaqz1 +# zlaqz2 +# zlaqz3 + @lapack_extendedprecision_objs = ( zposvxx, clagge, clatms, chesvxx, cposvxx, cgesvxx, ssyrfssx, csyrfsx, dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx, ); @lapack_deprecated_objsc = ( + cgelqs, cgeqrs, cgegs, cggsvd, cgegv, cggsvp, cgelsx, clahrd, @@ -845,13 +918,16 @@ zpotri, ctzrqf, ); @lapack_deprecated_objsd = ( + dgelqs, dgeqrs, dgegs, dgeqpf, dgegv, dggsvd, dgelsx, dggsvp, dlahrd, dlatzm, dtzrqf); -@lapack_deprecated_objss = ( +@lapack_deprecated_objss = ( + sgelqs, + sgeqrs, sgelsx, sgegs, sgegv, @@ -864,6 +940,8 @@ zpotri, ); @lapack_deprecated_objsz = ( + zgelqs, + zgeqrs, zgegs, zgegv, zgelsx, @@ -997,6 +1075,10 @@ zpotri, LAPACKE_cgebrd_work, LAPACKE_cgecon, LAPACKE_cgecon_work, + LAPACKE_cgedmd, + LAPACKE_cgedmd_work, + LAPACKE_cgedmdq, + LAPACKE_cgedmdq_work, LAPACKE_cgeequ, LAPACKE_cgeequ_work, LAPACKE_cgeequb, @@ -1585,7 +1667,14 @@ zpotri, LAPACKE_cgetsqrhrt_work, LAPACKE_cungtsqr_row, LAPACKE_cungtsqr_row_work, - + LAPACKE_clangb, + LAPACKE_clangb_work, + LAPACKE_ctrsyl3, + LAPACKE_ctrsyl3_work, + LAPACKE_ctz_nancheck, + LAPACKE_ctz_trans, + LAPACKE_cunhr_col, + LAPACKE_cunhr_col_work ); @lapackeobjsd = ( LAPACKE_dgb_nancheck, @@ -1656,6 +1745,10 @@ zpotri, LAPACKE_dgebrd_work, LAPACKE_dgecon, LAPACKE_dgecon_work, + LAPACKE_dgedmd, + LAPACKE_dgedmd_work, + LAPACKE_dgedmdq, + LAPACKE_dgedmdq_work, LAPACKE_dgeequ, LAPACKE_dgeequ_work, LAPACKE_dgeequb, @@ -2198,6 +2291,14 @@ zpotri, LAPACKE_dgetsqrhrt_work, LAPACKE_dorgtsqr_row, LAPACKE_dorgtsqr_row_work, + LAPACKE_dlangb, + LAPACKE_dlangb_work, + LAPACKE_dorhr_col, + LAPACKE_dorhr_col_work, + LAPACKE_dtrsyl3, + LAPACKE_dtrsyl3_work, + LAPACKE_dtz_nancheck, + LAPACKE_dtz_trans, ); @lapackeobjss = ( @@ -2269,6 +2370,10 @@ zpotri, LAPACKE_sgebrd_work, LAPACKE_sgecon, LAPACKE_sgecon_work, + LAPACKE_sgedmd, + LAPACKE_sgedmd_work, + LAPACKE_sgedmdq, + LAPACKE_sgedmdq_work, LAPACKE_sgeequ, LAPACKE_sgeequ_work, LAPACKE_sgeequb, @@ -2803,6 +2908,14 @@ zpotri, LAPACKE_sgetsqrhrt_work, LAPACKE_sorgtsqr_row, LAPACKE_sorgtsqr_row_work, + LAPACKE_slangb, + LAPACKE_slangb_work, + LAPACKE_sorhr_col, + LAPACKE_sorhr_col_work, + LAPACKE_strsyl3, + LAPACKE_strsyl3_work, + LAPACKE_stz_nancheck, + LAPACKE_stz_trans, ); @lapackeobjsz = ( @@ -2878,6 +2991,10 @@ zpotri, LAPACKE_zgebrd_work, LAPACKE_zgecon, LAPACKE_zgecon_work, + LAPACKE_zgedmd, + LAPACKE_zgedmd_work, + LAPACKE_zgedmdq, + LAPACKE_zgedmdq_work, LAPACKE_zgeequ, LAPACKE_zgeequ_work, LAPACKE_zgeequb, @@ -3346,6 +3463,14 @@ zpotri, LAPACKE_zgetsqrhrt_work, LAPACKE_zungtsqr_row, LAPACKE_zungtsqr_row_work, + LAPACKE_zlangb, + LAPACKE_zlangb_work, + LAPACKE_zunhr_col, + LAPACKE_zunhr_col_work, + LAPACKE_ztrsyl3, + LAPACKE_ztrsyl3_work, + LAPACKE_ztz_nancheck, + LAPACKE_ztz_trans, ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the @@ -3573,7 +3698,7 @@ zpotri, ssygv_2stage, ssysv_aa_2stage, ssytrf_aa_2stage, ssytrs_aa_2stage, - slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, + slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, slarfb_gett ); @lapack_embeded_underscore_objs_c=( chetf2_rook, chetrf_rook, chetri_rook, @@ -3598,7 +3723,7 @@ zpotri, chetrf_aa_2stage, 
chetrs_aa_2stage, csysv_aa_2stage, csytrf_aa_2stage, csytrs_aa_2stage, - claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, + claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, clarfb_gett ); @lapack_embeded_underscore_objs_d=( dlasyf_rook, @@ -3615,7 +3740,7 @@ zpotri, dsbevd_2stage, dsygv_2stage, dsysv_aa_2stage, dsytrf_aa_2stage, dsytrs_aa_2stage, - dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, + dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, dlarfb_gett ); @lapack_embeded_underscore_objs_z=( zhetf2_rook, zhetrf_rook, zhetri_rook, @@ -3639,7 +3764,7 @@ zpotri, zhesv_aa_2stage, zhetrf_aa_2stage, zhetrs_aa_2stage, zsysv_aa_2stage, zsytrf_aa_2stage, zsytrs_aa_2stage, - zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col, + zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col, zlarfb_gett ); From 51c1fb1f93fd6d7f7fd24dd22c954ba9527b4a05 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 Mar 2025 23:36:49 +0100 Subject: [PATCH 094/205] Fix ?spmv build and misinterpretation of NO_LAPACK=0 --- interface/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index c0d5896e1..393b7cdd3 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -30,17 +30,17 @@ set(BLAS2_SOURCES gemv.c ger.c trsv.c trmv.c syr2.c gbmv.c - sbmv.c + sbmv.c spmv.c spr2.c tbsv.c tbmv.c tpsv.c tpmv.c ) set(BLAS2_REAL_ONLY_SOURCES - symv.c syr.c spmv.c spr.c + symv.c syr.c spr.c ) set(BLAS2_COMPLEX_LAPACK_SOURCES - symv.c syr.c spmv.c spr.c + symv.c syr.c spr.c ) set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES @@ -195,7 +195,7 @@ if (NOT DEFINED NO_CBLAS) endforeach () endif() -if (NOT DEFINED NO_LAPACK) +if (NOT NO_LAPACK) set(LAPACK_SOURCES lapack/gesv.c ) From 02fd1df10b4a2e36a848d852589a5ba25214926c Mon Sep 17 00:00:00 2001 From: Ruiyang Wu Date: Wed, 12 Mar 2025 20:41:55 -0400 Subject: [PATCH 095/205] CMake: Pass `OpenMP` compiler and linker flags through CMake targets Using `OpenMP::OpenMP_LANG` targets for CMake is less error-prone than passing the compiler and linker flags manually. Furthermore, it allows the user to customize those flags by setting `OpenMP_LANG_FLAGS`, `OpenMP_LANG_LIB_NAMES`, and `OpenMP_omp_LIBRARY`. --- CMakeLists.txt | 22 ++++++++++++++++------ cmake/arch.cmake | 11 ----------- cmake/fc.cmake | 24 ++++++++++++------------ cmake/system.cmake | 17 ++++++++--------- driver/level2/CMakeLists.txt | 4 ++++ driver/level3/CMakeLists.txt | 4 ++++ driver/others/CMakeLists.txt | 4 ++++ interface/CMakeLists.txt | 4 ++++ kernel/CMakeLists.txt | 3 +++ lapack/CMakeLists.txt | 4 ++++ 10 files changed, 59 insertions(+), 38 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index df9c631d4..6a8d36c40 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,10 +119,6 @@ endif() message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") -if (USE_OPENMP) - find_package(OpenMP REQUIRED) -endif () - include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") @@ -240,6 +236,12 @@ endif () # add objects to the openblas lib if(NOT NO_LAPACK) add_library(LAPACK_OVERRIDES OBJECT ${LA_SOURCES}) + if (USE_OPENMP AND (NOT NOFORTRAN)) + # Disable OpenMP for LAPACK Fortran codes on Windows. 
+ if(NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + target_link_libraries(LAPACK_OVERRIDES OpenMP::OpenMP_Fortran) + endif() + endif() list(APPEND TARGET_OBJS "$") endif() if(NOT NO_LAPACKE) @@ -281,10 +283,18 @@ endif() if (USE_OPENMP) if(BUILD_STATIC_LIBS) - target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C) + if(NOFORTRAN) + target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C) + else() + target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C OpenMP::OpenMP_Fortran) + endif() endif() if(BUILD_SHARED_LIBS) - target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C) + if(NOFORTRAN) + target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C) + else() + target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C OpenMP::OpenMP_Fortran) + endif() endif() endif() diff --git a/cmake/arch.cmake b/cmake/arch.cmake index ec91a2d59..d9a7aafd6 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -31,17 +31,6 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel") set(CCOMMON_OPT "${CCOMMON_OPT} -wd981") endif () -if (USE_OPENMP) - # USE_SIMPLE_THREADED_LEVEL3 = 1 - # NO_AFFINITY = 1 - find_package(OpenMP REQUIRED) - if (OpenMP_FOUND) - set(CCOMMON_OPT "${CCOMMON_OPT} ${OpenMP_C_FLAGS} -DUSE_OPENMP") - set(FCOMMON_OPT "${FCOMMON_OPT} ${OpenMP_Fortran_FLAGS}") - endif() -endif () - - if (DYNAMIC_ARCH) if (ARM64) set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 38bd406a3..3aa6a151d 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -7,7 +7,7 @@ if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "L # This is for classic Flang. LLVM Flang is handled with gfortran below. 
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") + set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") endif () set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee") endif () @@ -117,7 +117,7 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F endif () if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") + set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -128,14 +128,14 @@ if (${F_COMPILER} STREQUAL "INTEL" OR CMAKE_Fortran_COMPILER_ID MATCHES "Intel") endif () set(FCOMMON_OPT "${FCOMMON_OPT} -recursive -fp-model=consistent") if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () if (${F_COMPILER} STREQUAL "FUJITSU") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FUJITSU") if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -151,7 +151,7 @@ if (${F_COMPILER} STREQUAL "IBM") set(FCOMMON_OPT "${FCOMMON_OPT} -q32") endif () if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -168,7 +168,7 @@ if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95") endif () set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive") if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -mp") + set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -195,7 +195,7 @@ if (${F_COMPILER} STREQUAL "PATHSCALE") endif () if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -mp") + set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -233,7 +233,7 @@ if (${F_COMPILER} STREQUAL "OPEN64") if (USE_OPENMP) set(FEXTRALIB "${FEXTRALIB} -lstdc++") - set(FCOMMON_OPT "${FCOMMON_OPT} -mp") + set(OpenMP_Fortran_FLAGS "-mp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -245,14 +245,14 @@ if (${F_COMPILER} STREQUAL "SUN") set(FCOMMON_OPT "${FCOMMON_OPT} -m64") endif () if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -xopenmp=parallel") + set(OpenMP_Fortran_FLAGS "-xopenmp=parallel" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () if (${F_COMPILER} STREQUAL "COMPAQ") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_COMPAQ") if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -265,7 +265,7 @@ if (${F_COMPILER} STREQUAL "CRAY") if (NOT USE_OPENMP) set(FCOMMON_OPT "${FCOMMON_OPT} -fno-openmp") else () - set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") + set(OpenMP_Fortran_FLAGS "-fopenmp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () @@ -290,7 +290,7 @@ if (${F_COMPILER} STREQUAL "NAGFOR") # -w=unused: Suppress warning messages about unused variables set(FCOMMON_OPT "${FCOMMON_OPT} -w=x77 -w=ques -w=unused") if (USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + set(OpenMP_Fortran_FLAGS "-openmp" CACHE STRING "OpenMP Fortran compiler flags") endif () endif () diff --git a/cmake/system.cmake b/cmake/system.cmake index efaafee40..4c9d9198c 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -372,6 +372,14 @@ else () endif () endif () +if (USE_OPENMP) + find_package(OpenMP COMPONENTS 
C REQUIRED) + set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_OPENMP") + if (NOT NOFORTRAN) + find_package(OpenMP COMPONENTS Fortran REQUIRED) + endif () +endif () + if (BINARY64) if (INTERFACE64) # CCOMMON_OPT += -DUSE64BITINT @@ -655,15 +663,6 @@ if (LAPACK_STRLEN) endif() set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") -#Disable -fopenmp for LAPACK Fortran codes on Windows. -if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - set(FILTER_FLAGS "-fopenmp;-mp;-openmp;-xopenmp=parallel") - foreach (FILTER_FLAG ${FILTER_FLAGS}) - string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) - string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) - endforeach () -endif () - if (CMAKE_Fortran_COMPILER) if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 3e9964ab1..c52b461a7 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -223,3 +223,7 @@ if (USE_THREAD) endif () add_library(driver_level2 OBJECT ${OPENBLAS_SRC}) + +if (USE_OPENMP) + target_link_libraries(driver_level2 OpenMP::OpenMP_C) +endif() diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index b1ec94c23..eabfeed24 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -171,3 +171,7 @@ endforeach () # add_library(driver_level3 OBJECT ${OPENBLAS_SRC}) + +if (USE_OPENMP) + target_link_libraries(driver_level3 OpenMP::OpenMP_C) +endif() diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 139f329ec..ebcc0aa78 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -88,3 +88,7 @@ endif () #endif add_library(driver_others OBJECT ${OPENBLAS_SRC} ${MEMORY} ${SMP_SOURCES} ${COMMON_SOURCES}) + +if (USE_OPENMP) + target_link_libraries(driver_others OpenMP::OpenMP_C) +endif() diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index c0d5896e1..12b679c84 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -250,3 +250,7 @@ if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) endif () add_library(interface OBJECT ${OPENBLAS_SRC}) + +if (USE_OPENMP) + target_link_libraries(interface OpenMP::OpenMP_C) +endif() diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index d3262e07d..81185f603 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -1364,6 +1364,9 @@ endif () if (USE_GEMM3M) target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M) endif() + if (USE_OPENMP) + target_link_libraries(kernel${TSUFFIX} OpenMP::OpenMP_C) + endif() endfunction () diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 1d44e9490..bbaacb2be 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -117,3 +117,7 @@ GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "" 4) GenerateCombinationObjects("${UNIT_SOURCES2}" "UNIT" "N" "" 0 "" "" 3) add_library(lapack OBJECT ${OPENBLAS_SRC}) + +if (USE_OPENMP) + target_link_libraries(lapack OpenMP::OpenMP_C) +endif() From 1b0c0f00e9448a4e2866cc3449b02cc4d727abf7 Mon Sep 17 00:00:00 2001 From: Ruiyang Wu Date: Thu, 13 Mar 2025 02:25:52 -0400 Subject: [PATCH 096/205] CMake: Avoid mixed OpenMP linkage --- cmake/system.cmake | 6 ++++++ ctest/CMakeLists.txt | 16 ---------------- lapack-netlib/TESTING/EIG/CMakeLists.txt | 6 ------ lapack-netlib/TESTING/LIN/CMakeLists.txt | 4 ---- test/CMakeLists.txt | 4 ---- 5 files 
changed, 6 insertions(+), 30 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 4c9d9198c..14b2c65b1 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -377,6 +377,12 @@ if (USE_OPENMP) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_OPENMP") if (NOT NOFORTRAN) find_package(OpenMP COMPONENTS Fortran REQUIRED) + # Avoid mixed OpenMP linkage + get_target_property(OMP_C_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES) + get_target_property(OMP_Fortran_LIB OpenMP::OpenMP_Fortran INTERFACE_LINK_LIBRARIES) + if (NOT OMP_C_LIB STREQUAL OMP_Fortran_LIB) + message(FATAL_ERROR "Multiple OpenMP runtime libraries detected. Mixed OpenMP runtime linkage is dangerous. You may pass -DOpenMP_LANG_LIB_NAMES and -DOpenMP_omp_LIBRARY to manually choose the OpenMP library.") + endif() endif () endif () diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 4496eff82..03b157843 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -44,10 +44,6 @@ else() c_${float_char}blas1.c) endif() target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) - if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) - string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") - target_link_libraries(x${float_char}cblat1 omp pthread) - endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat1 m) endif() @@ -73,10 +69,6 @@ else() constant.c) endif() target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) - if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) - string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") - target_link_libraries(x${float_char}cblat2 omp pthread) - endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat2 m) endif() @@ -124,20 +116,12 @@ else() endif() endif() target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) - if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) - string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") - target_link_libraries(x${float_char}cblat3 omp pthread) - endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat3 m) endif() if (USE_GEMM3M) if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")) target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME}) - if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) - string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") - target_link_libraries(x${float_char}cblat3 omp pthread) - endif() if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat3_3m m) endif() diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index e4c4181b2..d9c34fe98 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -107,12 +107,6 @@ set(ZDMDEIGTST zchkdmd.f90) macro(add_eig_executable name) add_executable(${name} ${ARGN}) 
target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) - - if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) - string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") - target_link_libraries(${name} omp pthread) - endif() - #${TMGLIB} ../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index e406570e1..95baa3122 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -240,10 +240,6 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr macro(add_lin_executable name) add_executable(${name} ${ARGN}) target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) - if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) - string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") - target_link_libraries(${name} omp pthread) - endif() #${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4ebd5348c..f874fa5ea 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -34,10 +34,6 @@ endif () foreach(test_bin ${OpenBLAS_Tests}) add_executable(${test_bin} ${test_bin}.f) target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}) -if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang)) - string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}") -target_link_libraries(${test_bin} omp pthread) -endif() endforeach() # $1 exec, $2 input, $3 output_result From 251c3f857dc5148a61c4475f1bbfad2de6046a31 Mon Sep 17 00:00:00 2001 From: Ruiyang Wu Date: Wed, 26 Mar 2025 23:19:40 -0400 Subject: [PATCH 097/205] gh m1: fix mixed linkage when built with OpenMP and clang+gfortran --- .github/workflows/apple_m.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/apple_m.yml b/.github/workflows/apple_m.yml index e34eada86..81952dabd 100644 --- a/.github/workflows/apple_m.yml +++ b/.github/workflows/apple_m.yml @@ -102,6 +102,7 @@ jobs: mkdir build && cd build cmake -DDYNAMIC_ARCH=1 \ -DUSE_OPENMP=${{matrix.openmp}} \ + -DOpenMP_Fortran_LIB_NAMES=omp \ -DINTERFACE64=${{matrix.ilp64}} \ -DNOFORTRAN=0 \ -DBUILD_WITHOUT_LAPACK=0 \ From ea6515c4b33800ca6c91148a5cdb4452937c87fc Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Wed, 26 Mar 2025 17:35:21 +0100 Subject: [PATCH 098/205] On zarch don't produce objects from assembler with a writable stack section On z-series, the current version of the GNU toolchain produces warnings such as: ``` /usr/lib64/gcc/[...]/s390x-suse-linux/bin/ld: warning: ztrmm_kernel_RC_Z14.o: missing .note.GNU-stack section implies executable stack /usr/lib64/[...]/s390x-suse-linux/bin/ld: NOTE: This behaviour is deprecated and will be removed in a future version of the linker ``` To prevent this message and make sure we are future proof, add ``` .section .note.GNU-stack,"",@progbits ``` Also add the `.size` bit to give the asm defined functions a proper size in the symbol table. 
Signed-off-by: Egbert Eich --- common_zarch.h | 11 +++++++++-- kernel/zarch/ctrmm4x4V.S | 2 ++ kernel/zarch/gemm8x4V.S | 2 ++ kernel/zarch/strmm8x4V.S | 2 ++ kernel/zarch/trmm8x4V.S | 2 ++ kernel/zarch/ztrmm4x4V.S | 2 ++ 6 files changed, 19 insertions(+), 2 deletions(-) diff --git a/common_zarch.h b/common_zarch.h index 7911f11ae..035bcd27c 100644 --- a/common_zarch.h +++ b/common_zarch.h @@ -103,9 +103,16 @@ static inline int blas_quickdivide(blasint x, blasint y){ .global REALNAME ;\ .type REALNAME, %function ;\ REALNAME: - -#define EPILOGUE +#if defined(__ELF__) && defined(__linux__) +# define GNUSTACK .section .note.GNU-stack,"",@progbits +#else +# define GNUSTACK +#endif + +#define EPILOGUE \ + .size REALNAME, .-REALNAME; \ + GNUSTACK #define PROFCODE diff --git a/kernel/zarch/ctrmm4x4V.S b/kernel/zarch/ctrmm4x4V.S index 123f2ead0..dd997fbdf 100644 --- a/kernel/zarch/ctrmm4x4V.S +++ b/kernel/zarch/ctrmm4x4V.S @@ -714,6 +714,8 @@ ld %f10,136(%r15) ld %f11,144(%r15) ld %f12,152(%r15) br %r14 + +EPILOGUE .end diff --git a/kernel/zarch/gemm8x4V.S b/kernel/zarch/gemm8x4V.S index 633e60ea6..47ce5f8dd 100644 --- a/kernel/zarch/gemm8x4V.S +++ b/kernel/zarch/gemm8x4V.S @@ -604,6 +604,8 @@ ALIGN_2 /*end*/ lmg %r6,%r12,48(%r15) br %r14 + +EPILOGUE .end diff --git a/kernel/zarch/strmm8x4V.S b/kernel/zarch/strmm8x4V.S index e34a7a05a..c93c928cb 100644 --- a/kernel/zarch/strmm8x4V.S +++ b/kernel/zarch/strmm8x4V.S @@ -845,6 +845,8 @@ ALIGN_2 lmg %r6,%r12,48(%r15) #endif br %r14 + +EPILOGUE .end diff --git a/kernel/zarch/trmm8x4V.S b/kernel/zarch/trmm8x4V.S index 4da113ff3..de337e351 100644 --- a/kernel/zarch/trmm8x4V.S +++ b/kernel/zarch/trmm8x4V.S @@ -864,6 +864,8 @@ ALIGN_2 lmg %r6,%r12,48(%r15) #endif br %r14 + +EPILOGUE .end diff --git a/kernel/zarch/ztrmm4x4V.S b/kernel/zarch/ztrmm4x4V.S index 6fd7f2509..fa99daee5 100644 --- a/kernel/zarch/ztrmm4x4V.S +++ b/kernel/zarch/ztrmm4x4V.S @@ -719,6 +719,8 @@ ld %f10,136(%r15) ld %f11,144(%r15) ld %f12,152(%r15) br %r14 + +EPILOGUE .end From 61b9339d3a1fd7a4c4d91fce92ac55e41f80a08a Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Fri, 28 Mar 2025 08:59:26 +0100 Subject: [PATCH 099/205] getarch/cpuid.S: Fix warning about executable stack When using the GNU toolchain a warning is printed about an executable stack: /usr/lib64/gcc/.../x86_64-suse-linux/bin/ld: warning: /tmp/ccyG3xBB.o: missing .note.GNU-stack section implies executable stack [ 15s] /usr/lib64/gcc/.../x86_64-suse-linux/bin/ld: NOTE: This behaviour is deprecated and will be removed in a future version of the linker to prevent this warning, add: ``` .section .note.GNU-stack,"",@progbits ``` Signed-off-by: Egbert Eich --- cpuid.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpuid.S b/cpuid.S index 851fe34d2..295917bdb 100644 --- a/cpuid.S +++ b/cpuid.S @@ -65,3 +65,6 @@ _cpuid: .subsections_via_symbols #endif +#if defined(__ELF__) && defined(__linux__) + .section .note.GNU-stack,"",@progbits +#endif From 3fc15ad81cabf3f847bfa99c1fcb3e8039543068 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Mar 2025 23:22:09 +0200 Subject: [PATCH 100/205] Fix pdb file creation in debug dll builds with CMake on Windows/WoA --- CMakeLists.txt | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index df9c631d4..9bcfd38ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,11 +78,12 @@ set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all
exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) if (CMAKE_SYSTEM_NAME MATCHES "Windows" AND BUILD_SHARED_LIBS AND NOT ("${SYMBOLPREFIX}${SYMBOLSUFFIX}" STREQUAL "")) +set (DELETE_STATIC_LIBS "") if (NOT BUILD_STATIC_LIBS) message (STATUS "forcing build of a temporary static library for symbol renaming") set (BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared library" FORCE) set (BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) - set (DELETE_STATIC_LIBS 1) + set (DELETE_STATIC_LIBS file (REMOVE $/${OpenBLAS_LIBNAME}.lib)) endif () endif() @@ -481,15 +482,22 @@ if (BUILD_SHARED_LIBS OR DELETE_STATIC_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFI if (CMAKE_SYSTEM_NAME MATCHES "Windows") set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) - #if (USE_PERL) +if (CMAKE_BUILD_TYPE MATCHES "Debug") +set (CRTLIB msvcrtd) +set (PDBOPT -debug -pdb:$/${OpenBLAS_LIBNAME}.pdb) +set (PDB_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) +else () +set (CRTLIB msvcrt) +set (PDBOPT "") +endif() + #if (USE_PERL) message(STATUS "adding postbuild instruction to rename syms") add_custom_command(TARGET ${OpenBLAS_LIBNAME}_static POST_BUILD - COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "win2k" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/renamesyms.def + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "win2k" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/renamesyms.def COMMAND ${CMAKE_C_COMPILER} ${CMAKE_C_FLAGS} -I${PROJECT_SOURCE_DIR} -I${PROJECT_BINARY_DIR} -c -o ${PROJECT_BINARY_DIR}/dllinit.o ${PROJECT_SOURCE_DIR}/exports/dllinit.c - COMMAND lld-link -nodefaultlib:libcmt -defaultlib:msvcrt ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $ -wholearchive:$ -dll -out:$/${OpenBLAS_LIBNAME}.dll -implib:$/${OpenBLAS_LIBNAME}.dll.a - #if (${REMOVE_STATIC_LIB}) - #file (REMOVE $/${OpenBLAS_LIBNAME}.lib) - #endif () + COMMAND lld-link -nodefaultlib:libcmt -defaultlib:${CRTLIB} ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $ -wholearchive:$ -dll -out:$/${OpenBLAS_LIBNAME}.dll -implib:$/${OpenBLAS_LIBNAME}.dll.a ${PDBOPT} + #COMMAND lld-link -nodefaultlib:libcmt -defaultlib:msvcrt ${CMAKE_LINKER_FLAGS} -errorlimit:0 -def:${PROJECT_BINARY_DIR}/renamesyms.def ${PROJECT_BINARY_DIR}/dllinit.o $ -wholearchive:$ -dll -out:$/${OpenBLAS_LIBNAME}.dll -implib:$/${OpenBLAS_LIBNAME}.dll.a + ${REMOVE_STATIC_LIB} VERBATIM ) #endif () else () From 04915be8295f0c28723f9dd6a69fc55a6f680118 Mon Sep 17 00:00:00 2001 From: Vaisakh K V Date: Thu, 3 Apr 2025 12:18:43 +0530 Subject: [PATCH 101/205] Add vector registers to clobber list to prevent compiler optimization. SME based SGEMMDIRECT kernel uses the vector registers (z) and adding clobber list informs compiler not to optimize these registers. 
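As a minimal sketch of the idea (register list abbreviated here; the kernel change below names all of p0-p15 and z0-z31), an empty inline-assembly statement whose clobber list covers the SVE predicate and vector registers acts as a barrier, so the compiler will not keep live values in those registers across the out-of-line SME routine:
```
/* Illustrative only: an empty asm statement with the SVE registers in its
   clobber list forces the compiler to assume their contents are destroyed. */
asm volatile("" : : : "p0", "p1", "p2", "p3",
                      "z0", "z1", "z2", "z3" /* ... through p15 and z31 */);
```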
--- kernel/arm64/sgemm_direct_arm64_sme1.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/kernel/arm64/sgemm_direct_arm64_sme1.c b/kernel/arm64/sgemm_direct_arm64_sme1.c index bd7e54889..50c2a9a2d 100644 --- a/kernel/arm64/sgemm_direct_arm64_sme1.c +++ b/kernel/arm64/sgemm_direct_arm64_sme1.c @@ -7,7 +7,6 @@ #include #include #include - #if defined(HAVE_SME) /* Function prototypes */ @@ -44,7 +43,17 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ m_mod = ceil((double)M/(double)vl_elms) * vl_elms; float *A_mod = (float *) malloc(m_mod*K*sizeof(float)); - + + /* Prevent compiler optimization by reading from memory instead + * of reading directly from vector (z) registers. + * */ + asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); + /* Pre-process the left matrix to make it suitable for matrix sum of outer-product calculation */ @@ -52,7 +61,13 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ /* Calculate C = A*B */ sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R); - + + asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); free(A_mod); } From 0aa5ef29ec27ad6b0c5858d352ba1b4ea0b35b50 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Apr 2025 23:54:56 +0200 Subject: [PATCH 102/205] Repeat the libs target's "ln" in the all target to ensure completeness --- Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile b/Makefile index 4c7217734..2083c3c78 100644 --- a/Makefile +++ b/Makefile @@ -93,6 +93,11 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ fi endif + +ifeq ($(OSNAME), WINNT) + @-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) +endif + ifneq ($(OSNAME), AIX) @echo -n " Library Name ... $(LIBNAME)" else From 7bf848454ddfd8713c50de6c55138388c2823a33 Mon Sep 17 00:00:00 2001 From: ColumbusAI <75283809+ColumbusAI@users.noreply.github.com> Date: Sat, 5 Apr 2025 09:57:53 -0700 Subject: [PATCH 103/205] Update zsum.c -- fixed spelling error to successfully compile spelling error where zsum_kernel is used and it should be zasum_kernel. Will not compile without fix. 
--- kernel/x86_64/zsum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zsum.c b/kernel/x86_64/zsum.c index 5973c1253..974d1b2f1 100644 --- a/kernel/x86_64/zsum.c +++ b/kernel/x86_64/zsum.c @@ -54,7 +54,7 @@ static FLOAT sum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(sumf); if (inc_x == 1) { - sumf = zsum_kernel(n, x); + sumf = zasum_kernel(n, x); } else { inc_x2 = 2 * inc_x; From 1ed962d25975ff9fba9ca08f5f5d5101bd6426b7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Apr 2025 10:44:48 -0700 Subject: [PATCH 104/205] Fix compilation with xcode16.3/clang17/gcc14 --- Makefile.system | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Makefile.system b/Makefile.system index d6dd9e960..79544276b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -435,6 +435,15 @@ ifeq (x$(XCVER), x 15) CCOMMON_OPT += -Wl,-ld_classic FCOMMON_OPT += -Wl,-ld_classic endif +ifeq (x$(XCVER), x 16) +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -Wl,-ld_classic +FCOMMON_OPT += -Wl,-ld_classic +endif +ifeq ($(F_COMPILER), GFORTRAN) +override CEXTRALIB := $(filter-out(-lto_library, $(CEXTRALIB))) +endif +endif endif ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly)) From 67c5bdd639b7c64b764bf5a279e8f58a567742ab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 7 Apr 2025 12:20:43 -0700 Subject: [PATCH 105/205] Azure CI: Update flang call in OSX_LLVM_flangnew job (#5208) * Update flang call in OSX_LLVM_flangnew job --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 26f4c2af3..7941bf463 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -175,7 +175,7 @@ jobs: - script: | brew update brew install llvm flang - make TARGET=NEHALEM CC=/usr/local/opt/llvm/bin/clang FC=/usr/local/Cellar/flang/19.1.7_1/bin/flang-new NO_SHARED=1 + make TARGET=NEHALEM CC=/usr/local/opt/llvm/bin/clang FC=/usr/local/opt/flang/bin/flang NO_SHARED=1 - job: OSX_OpenMP_Clang pool: From 1c5d0d5539d2d2dfb5d2cc431a9cba1a214f59df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Apr 2025 10:44:36 +0200 Subject: [PATCH 106/205] move libomp to extralib --- test/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/Makefile b/test/Makefile index 65576d3dd..9ba88988b 100644 --- a/test/Makefile +++ b/test/Makefile @@ -299,18 +299,18 @@ CLDFLAGS = $(CFLAGS) $(LDFLAGS) ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), CLANG) -CEXTRALIB += -lomp +EXTRALIB += -lomp endif endif ifeq ($(F_COMPILER), NAG) -CEXTRALIB = -lgomp +EXTRALIB = -lgomp endif ifeq ($(F_COMPILER), IBM) ifeq ($(C_COMPILER), GCC) -CEXTRALIB += -lgomp +EXTRALIB += -lgomp endif ifeq ($(C_COMPILER), CLANG) -CEXTRALIB += -lomp +EXTRALIB += -lomp endif endif endif From fc8090b60774447a2d205bda363f14408e14a780 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Apr 2025 11:54:36 +0200 Subject: [PATCH 107/205] Move additional omp dependency to EXTRALIB --- ctest/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ctest/Makefile b/ctest/Makefile index 877a190c1..e6f683bd8 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -235,18 +235,18 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), CLANG) -CEXTRALIB += -lomp +EXTRALIB += -lomp endif endif ifeq ($(F_COMPILER), NAG) -CEXTRALIB = -lgomp +EXTRALIB = -lgomp endif ifeq ($(F_COMPILER), 
IBM) ifeq ($(C_COMPILER), GCC) -CEXTRALIB += -lgomp +EXTRALIB += -lgomp endif ifeq ($(C_COMPILER), CLANG) -CEXTRALIB += -lomp +EXTRALIB += -lomp endif endif endif From 1ff303f36e54da80e459aad3556b561ee70b78b0 Mon Sep 17 00:00:00 2001 From: lglglglgy <2661896437@qq.com> Date: Tue, 8 Apr 2025 21:18:00 +0800 Subject: [PATCH 108/205] Optimizing the Implementation of GEMV on the RISC-V V Extension Specialized some scenarios, performed loop unrolling, and reduced the number of multiplications. --- kernel/riscv64/gemv_n_vector.c | 304 ++++++++++++++++++++++----------- 1 file changed, 207 insertions(+), 97 deletions(-) diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index aa13fc87d..64ed532cb 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -27,13 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) -#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) -#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) -#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) -#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m8) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m8) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m8) +#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m8) +#define VFILL_ZERO_FLOAT RISCV_RVV(vfsub_vv_f32m8) #else #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) #define FLOAT_V_T vfloat64m4_t @@ -42,103 +44,211 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFILL_ZERO_FLOAT RISCV_RVV(vfsub_vv_f64m4) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i = 0, j = 0, k = 0; - BLASLONG ix = 0, iy = 0; - - if(n < 0) return(0); - FLOAT *a_ptr = a; - FLOAT temp = 0.0; - FLOAT_V_T va0, va1, vy0, vy1; - unsigned int gvl = 0; - if(inc_y == 1){ - gvl = VSETVL(m); - if(gvl <= m/2){ - for(k=0,j=0; k Date: Tue, 8 Apr 2025 07:03:11 -0700 Subject: [PATCH 109/205] Fix incomplete error message (Reference-LAPACK PR 1119) --- lapack-netlib/TESTING/EIG/cerred.f | 12 ++++++------ lapack-netlib/TESTING/EIG/derred.f | 12 ++++++------ lapack-netlib/TESTING/EIG/serred.f | 12 ++++++------ lapack-netlib/TESTING/EIG/zerred.f | 12 ++++++------ 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cerred.f b/lapack-netlib/TESTING/EIG/cerred.f index 98d157080..7514a3241 100644 --- a/lapack-netlib/TESTING/EIG/cerred.f +++ b/lapack-netlib/TESTING/EIG/cerred.f @@ -332,7 +332,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test CGESDD @@ -367,7 +367,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test CGEJSV @@ -433,7 +433,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test CGESVDX @@ -492,7 +492,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test CGESVDQ @@ -547,7 +547,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF END IF * @@ -558,7 +558,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF END IF * diff --git a/lapack-netlib/TESTING/EIG/derred.f b/lapack-netlib/TESTING/EIG/derred.f index 11a932052..faa716f8b 100644 --- a/lapack-netlib/TESTING/EIG/derred.f +++ b/lapack-netlib/TESTING/EIG/derred.f @@ -329,7 +329,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test DGESDD @@ -358,7 +358,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test DGEJSV @@ -424,7 +424,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test DGESVDX @@ -483,7 +483,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test DGESVDQ @@ -538,7 +538,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 
1:LEN_TRIM( SRNAMT ) ) END IF END IF * @@ -549,7 +549,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF END IF * diff --git a/lapack-netlib/TESTING/EIG/serred.f b/lapack-netlib/TESTING/EIG/serred.f index b52aa1624..a55097eba 100644 --- a/lapack-netlib/TESTING/EIG/serred.f +++ b/lapack-netlib/TESTING/EIG/serred.f @@ -329,7 +329,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test SGESDD @@ -358,7 +358,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test SGEJSV @@ -424,7 +424,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test SGESVDX @@ -483,7 +483,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test SGESVDQ @@ -538,7 +538,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF END IF * @@ -549,7 +549,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF END IF * diff --git a/lapack-netlib/TESTING/EIG/zerred.f b/lapack-netlib/TESTING/EIG/zerred.f index 1876c1f1d..f325dcdc3 100644 --- a/lapack-netlib/TESTING/EIG/zerred.f +++ b/lapack-netlib/TESTING/EIG/zerred.f @@ -332,7 +332,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test ZGESDD @@ -367,7 +367,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test ZGEJSV @@ -433,7 +433,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test ZGESVDX @@ -492,7 +492,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF * * Test ZGESVDQ @@ -547,7 +547,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF END IF * @@ -558,7 +558,7 @@ WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ), $ NT ELSE - WRITE( NOUT, FMT = 9998 ) + WRITE( NOUT, FMT = 9998 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ) END IF END IF * From a34b487f225e219670e95ad89cc7ea98121a9439 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Apr 2025 17:25:46 +0200 Subject: [PATCH 110/205] Remove spurious cast from Alpha and Cell's DEFAULT_ALIGN --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index 229554f33..48b64fd2a 100644 --- a/param.h +++ b/param.h @@ -2146,7 +2146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 512 #define GEMM_DEFAULT_OFFSET_B 512 -#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2214,7 +2214,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 8192 -#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL +#define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 From ec146157d380c2a1003bac4d25b6fc39d55d4c0e Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Wed, 2 Apr 2025 09:11:58 +0000 Subject: [PATCH 111/205] Use SVE kernel for S/DGEMVT for SVE machines --- CONTRIBUTORS.md | 3 +++ kernel/arm64/KERNEL.ARMV8SVE | 4 ++-- kernel/arm64/KERNEL.NEOVERSEN2 | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 2e2979acc..6b0814dcc 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -26,6 +26,9 @@ * Chris Sidebottom * Optimizations and other improvements targeting AArch64 +* Annop Wongwathanarat + * Optimizations and other improvements targeting AArch64 + ## Previous Developers * Zaheer Chothia diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 9adacce63..4ff53c6d0 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -79,8 +79,8 @@ DGEMVNKERNEL = gemv_n.S CGEMVNKERNEL = zgemv_n.S ZGEMVNKERNEL = zgemv_n.S -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S +SGEMVTKERNEL = gemv_t_sve_v1x3.c +DGEMVTKERNEL = gemv_t_sve_v1x3.c CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index fc7fe6930..b9dc23562 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -65,8 +65,8 @@ DGEMVNKERNEL = gemv_n.S CGEMVNKERNEL = zgemv_n.S ZGEMVNKERNEL = zgemv_n.S -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S +SGEMVTKERNEL = gemv_t_sve_v1x3.c +DGEMVTKERNEL = gemv_t_sve_v1x3.c CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S From 51ba70f47bba6b7a161f61526a2f196fe896e1f6 Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Thu, 10 Apr 2025 15:20:34 +0200 Subject: [PATCH 112/205] test_potrs.c: remove pragma darwin-aarch64 support Using GCC 14.2.0 on Darwin, the pragma ultimately causes a linker error "ld: invalid r_symbolnum=". The current workaround is to use the old linker, but (a) it's deprecated and (b) it can produce libraries that are subsequently not linkable with the newer linker in dependents: the new ld64 does not link to libraries with duplicate rpaths created by the classic linker. --- Makefile.system | 4 ---- utest/test_potrs.c | 1 - 2 files changed, 5 deletions(-) diff --git a/Makefile.system b/Makefile.system index 79544276b..ac6a41c92 100644 --- a/Makefile.system +++ b/Makefile.system @@ -436,10 +436,6 @@ CCOMMON_OPT += -Wl,-ld_classic FCOMMON_OPT += -Wl,-ld_classic endif ifeq (x$(XCVER), x 16) -ifeq ($(C_COMPILER), GCC) -CCOMMON_OPT += -Wl,-ld_classic -FCOMMON_OPT += -Wl,-ld_classic -endif ifeq ($(F_COMPILER), GFORTRAN) override CEXTRALIB := $(filter-out(-lto_library, $(CEXTRALIB))) endif diff --git a/utest/test_potrs.c b/utest/test_potrs.c index 642ce1e37..e6ccf4bb6 100644 --- a/utest/test_potrs.c +++ b/utest/test_potrs.c @@ -32,7 +32,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #include "openblas_utest.h" -#pragma GCC optimize("no-gcse") /* void BLASFUNC(cpotrf)(char*, BLASINT*, complex float*, BLASINT*, BLASINT*); void BLASFUNC(zpotrs_(char*, BLASINT*, BLASINT*, complex double*, From 3d6d026fe1f61ed4ee501aa7d69b40196ee1b174 Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Thu, 10 Apr 2025 15:44:31 +0200 Subject: [PATCH 113/205] no-gcse when loongarch64 --- utest/test_potrs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utest/test_potrs.c b/utest/test_potrs.c index e6ccf4bb6..bcb1f753b 100644 --- a/utest/test_potrs.c +++ b/utest/test_potrs.c @@ -32,6 +32,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "openblas_utest.h" +#if defined(ARCH_LOONGARCH64) +#pragma GCC optimize("no-gcse") +#endif /* void BLASFUNC(cpotrf)(char*, BLASINT*, complex float*, BLASINT*, BLASINT*); void BLASFUNC(zpotrs_(char*, BLASINT*, BLASINT*, complex double*, From fd3afef1222b95d4c1a809667e7a76e259675fe1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Apr 2025 22:09:19 +0200 Subject: [PATCH 114/205] lapacke_mangling.h is no longer generated, so don't delete on make clean --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2083c3c78..90de50913 100644 --- a/Makefile +++ b/Makefile @@ -452,7 +452,7 @@ endif @rm -f cblas.tmp cblas.tmp2 @touch $(NETLIB_LAPACK_DIR)/make.inc @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean - @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h + @rm -f $(NETLIB_LAPACK_DIR)/make.inc @$(MAKE) -C relapack clean @rm -f *.grd Makefile.conf_last config_last.h @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) From 211dfd0754a3800d178b14b24e312330954721df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Apr 2025 22:21:57 +0200 Subject: [PATCH 115/205] disable the CooperLake microkernel as it produces wrong results --- kernel/x86_64/sbgemv_n.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/sbgemv_n.c b/kernel/x86_64/sbgemv_n.c index 08ccace61..b2d4eb74f 100644 --- a/kernel/x86_64/sbgemv_n.c +++ b/kernel/x86_64/sbgemv_n.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) -#include "sbgemv_n_microk_cooperlake.c" -#endif +//#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +//#include "sbgemv_n_microk_cooperlake.c" +//#endif #define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \ ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \ From f1e628b88974ba984adf0ea08c817aef60548720 Mon Sep 17 00:00:00 2001 From: "Iha, Taisei" Date: Fri, 11 Apr 2025 20:00:33 +0900 Subject: [PATCH 116/205] Further performance improvements to [SD]GEMV. 
--- kernel/arm64/KERNEL.A64FX | 4 +- kernel/arm64/KERNEL.NEOVERSEV1 | 2 + kernel/arm64/gemv_n_sve_v1x3.c | 138 ++++++++++++++++++++++ kernel/arm64/gemv_n_sve_v4x3.c | 207 +++++++++++++++++++++++++++++++++ 4 files changed, 349 insertions(+), 2 deletions(-) create mode 100644 kernel/arm64/gemv_n_sve_v1x3.c create mode 100644 kernel/arm64/gemv_n_sve_v4x3.c diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 75f0f39a7..3d68271da 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -1,6 +1,6 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE -SGEMVNKERNEL = gemv_n_sve.c -DGEMVNKERNEL = gemv_n_sve.c +SGEMVNKERNEL = gemv_n_sve_v4x3.c +DGEMVNKERNEL = gemv_n_sve_v4x3.c SGEMVTKERNEL = gemv_t_sve_v4x3.c DGEMVTKERNEL = gemv_t_sve_v4x3.c diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index bacedf8cf..3e622bcbf 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -1,5 +1,7 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE +SGEMVNKERNEL = gemv_n_sve_v1x3.c +DGEMVNKERNEL = gemv_n_sve_v1x3.c SGEMVTKERNEL = gemv_t_sve_v1x3.c DGEMVTKERNEL = gemv_t_sve_v1x3.c ifeq ($(BUILD_BFLOAT16), 1) diff --git a/kernel/arm64/gemv_n_sve_v1x3.c b/kernel/arm64/gemv_n_sve_v1x3.c new file mode 100644 index 000000000..44c9a89b9 --- /dev/null +++ b/kernel/arm64/gemv_n_sve_v1x3.c @@ -0,0 +1,138 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include + +#include "common.h" + +#ifdef DOUBLE +#define SV_COUNT svcntd +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64_s64 +#define SV_DUP svdup_f64 +#else +#define SV_COUNT svcntw +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32_s64 +#define SV_DUP svdup_f32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + if (inc_y == 1) { + BLASLONG width = (n + 3 - 1) / 3; + + FLOAT *a0_ptr = a_ptr + lda * width * 0; + FLOAT *a1_ptr = a_ptr + lda * width * 1; + FLOAT *a2_ptr = a_ptr + lda * width * 2; + + FLOAT *x0_ptr = x + inc_x * width * 0; + FLOAT *x1_ptr = x + inc_x * width * 1; + FLOAT *x2_ptr = x + inc_x * width * 2; + + for (j = 0; j < width; j++) { + svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); + + SV_TYPE temp0_vec = SV_DUP(alpha * x0_ptr[ix]); + SV_TYPE temp1_vec = SV_DUP(alpha * x1_ptr[ix]); + SV_TYPE temp2_vec = SV_DUP(alpha * x2_ptr[ix]); + i = 0; + BLASLONG sve_size = SV_COUNT(); + while ((i + sve_size * 1 - 1) < m) { + SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + + y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); + y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); + y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); + + svst1_vnum(SV_TRUE(), y + i, 0, y0_vec); + i += sve_size * 1; + } + + if (i < m) { + svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); + + pg00 = svand_z(SV_TRUE(), pg0, pg00); + pg01 = svand_z(SV_TRUE(), pg0, pg01); + pg02 = svand_z(SV_TRUE(), pg0, pg02); + + SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + + y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); + y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); + y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); + + svst1_vnum(pg0, y + i, 0, y0_vec); + } + a0_ptr += lda; + a1_ptr += lda; + a2_ptr += lda; + ix += inc_x; + } + return(0); + } + + for (j = 0; j < n; j++) { + temp = alpha * x[ix]; + iy = 0; + for (i = 0; i < m; i++) { + y[iy] += temp * a_ptr[i]; + iy += inc_y; + } + a_ptr += lda; + ix += inc_x; + } + return (0); +} diff --git a/kernel/arm64/gemv_n_sve_v4x3.c b/kernel/arm64/gemv_n_sve_v4x3.c new file mode 100644 index 000000000..92e4f75b6 --- /dev/null +++ b/kernel/arm64/gemv_n_sve_v4x3.c @@ -0,0 +1,207 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include + +#include "common.h" + +#ifdef DOUBLE +#define SV_COUNT svcntd +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64_s64 +#define SV_DUP svdup_f64 +#else +#define SV_COUNT svcntw +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32_s64 +#define SV_DUP svdup_f32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + if (inc_y == 1) { + BLASLONG width = (n + 3 - 1) / 3; + + FLOAT *a0_ptr = a_ptr + lda * width * 0; + FLOAT *a1_ptr = a_ptr + lda * width * 1; + FLOAT *a2_ptr = a_ptr + lda * width * 2; + + FLOAT *x0_ptr = x + inc_x * width * 0; + FLOAT *x1_ptr = x + inc_x * width * 1; + FLOAT *x2_ptr = x + inc_x * width * 2; + + for (j = 0; j < width; j++) { + svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg10 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg20 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg30 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg11 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg21 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg31 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg12 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg22 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg32 = ((j + width * 2) < n) ? 
SV_TRUE() : svpfalse(); + + SV_TYPE temp0_vec = SV_DUP(alpha * x0_ptr[ix]); + SV_TYPE temp1_vec = SV_DUP(alpha * x1_ptr[ix]); + SV_TYPE temp2_vec = SV_DUP(alpha * x2_ptr[ix]); + i = 0; + BLASLONG sve_size = SV_COUNT(); + while ((i + sve_size * 4 - 1) < m) { + SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0); + SV_TYPE y1_vec = svld1_vnum(SV_TRUE(), y + i, 1); + SV_TYPE y2_vec = svld1_vnum(SV_TRUE(), y + i, 2); + SV_TYPE y3_vec = svld1_vnum(SV_TRUE(), y + i, 3); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); + SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); + SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); + SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); + SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); + SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); + SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); + SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); + SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); + + y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); + y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec); + y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec); + y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec); + y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); + y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec); + y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec); + y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec); + y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); + y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec); + y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec); + y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec); + + svst1_vnum(SV_TRUE(), y + i, 0, y0_vec); + svst1_vnum(SV_TRUE(), y + i, 1, y1_vec); + svst1_vnum(SV_TRUE(), y + i, 2, y2_vec); + svst1_vnum(SV_TRUE(), y + i, 3, y3_vec); + i += sve_size * 4; + } + + if (i < m) { + svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); + svbool_t pg1 = SV_WHILE(i + sve_size * 1, m); + svbool_t pg2 = SV_WHILE(i + sve_size * 2, m); + svbool_t pg3 = SV_WHILE(i + sve_size * 3, m); + + pg00 = svand_z(SV_TRUE(), pg0, pg00); + pg10 = svand_z(SV_TRUE(), pg1, pg10); + pg20 = svand_z(SV_TRUE(), pg2, pg20); + pg30 = svand_z(SV_TRUE(), pg3, pg30); + pg01 = svand_z(SV_TRUE(), pg0, pg01); + pg11 = svand_z(SV_TRUE(), pg1, pg11); + pg21 = svand_z(SV_TRUE(), pg2, pg21); + pg31 = svand_z(SV_TRUE(), pg3, pg31); + pg02 = svand_z(SV_TRUE(), pg0, pg02); + pg12 = svand_z(SV_TRUE(), pg1, pg12); + pg22 = svand_z(SV_TRUE(), pg2, pg22); + pg32 = svand_z(SV_TRUE(), pg3, pg32); + + SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0); + SV_TYPE y1_vec = svld1_vnum(pg1, y + i, 1); + SV_TYPE y2_vec = svld1_vnum(pg2, y + i, 2); + SV_TYPE y3_vec = svld1_vnum(pg3, y + i, 3); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); + SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); + SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); + SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); + SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); + SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); + SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); + SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); + SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); + + y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); + y1_vec = svmla_m(pg10, 
y1_vec, temp0_vec, a10_vec); + y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec); + y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec); + y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); + y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec); + y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec); + y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec); + y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); + y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec); + y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec); + y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec); + + svst1_vnum(pg0, y + i, 0, y0_vec); + svst1_vnum(pg1, y + i, 1, y1_vec); + svst1_vnum(pg2, y + i, 2, y2_vec); + svst1_vnum(pg3, y + i, 3, y3_vec); + } + a0_ptr += lda; + a1_ptr += lda; + a2_ptr += lda; + ix += inc_x; + } + return(0); + } + + for (j = 0; j < n; j++) { + temp = alpha * x[ix]; + iy = 0; + for (i = 0; i < m; i++) { + y[iy] += temp * a_ptr[i]; + iy += inc_y; + } + a_ptr += lda; + ix += inc_x; + } + return (0); +} From d711906e3e438995146c5c6dce23a2795bbf4740 Mon Sep 17 00:00:00 2001 From: "Usui, Tetsuzo" Date: Fri, 11 Apr 2025 20:39:52 +0900 Subject: [PATCH 117/205] Add symv kernels for arm64 --- kernel/arm64/KERNEL.ARMV8SVE | 5 ++ kernel/arm64/KERNEL.NEOVERSEN1 | 4 + kernel/arm64/symv_L_asimd_4x4.c | 113 +++++++++++++++++++++++++ kernel/arm64/symv_L_sve_v1x4.c | 103 +++++++++++++++++++++++ kernel/arm64/symv_U_asimd_4x4.c | 106 +++++++++++++++++++++++ kernel/arm64/symv_U_sve_v1x4.c | 104 +++++++++++++++++++++++ kernel/arm64/symv_microk_asimd_4x4.c | 120 +++++++++++++++++++++++++++ kernel/arm64/symv_microk_sve_v1x4.c | 89 ++++++++++++++++++++ 8 files changed, 644 insertions(+) create mode 100644 kernel/arm64/symv_L_asimd_4x4.c create mode 100644 kernel/arm64/symv_L_sve_v1x4.c create mode 100644 kernel/arm64/symv_U_asimd_4x4.c create mode 100644 kernel/arm64/symv_U_sve_v1x4.c create mode 100644 kernel/arm64/symv_microk_asimd_4x4.c create mode 100644 kernel/arm64/symv_microk_sve_v1x4.c diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 4ff53c6d0..0e51f2c2f 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -84,6 +84,11 @@ DGEMVTKERNEL = gemv_t_sve_v1x3.c CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S +SSYMV_L_KERNEL = symv_L_sve_v1x4.c +SSYMV_U_KERNEL = symv_U_sve_v1x4.c +DSYMV_L_KERNEL = symv_L_sve_v1x4.c +DSYMV_U_KERNEL = symv_U_sve_v1x4.c + SASUMKERNEL = sasum_thunderx2t99.c DASUMKERNEL = dasum_thunderx2t99.c CASUMKERNEL = casum_thunderx2t99.c diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 index e623814d6..665ebe459 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN1 +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -70,6 +70,10 @@ DGEMVTKERNEL = gemv_t.S CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S +SSYMV_L_KERNEL = symv_L_asimd_4x4.c +SSYMV_U_KERNEL = symv_U_asimd_4x4.c +DSYMV_L_KERNEL = symv_L_asimd_4x4.c +DSYMV_U_KERNEL = symv_U_asimd_4x4.c SASUMKERNEL = sasum_thunderx2t99.c DASUMKERNEL = dasum_thunderx2t99.c diff --git a/kernel/arm64/symv_L_asimd_4x4.c b/kernel/arm64/symv_L_asimd_4x4.c new file mode 100644 index 000000000..b3d15ba67 --- /dev/null +++ b/kernel/arm64/symv_L_asimd_4x4.c @@ -0,0 +1,113 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. 
Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "symv_microk_asimd_4x4.c" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j; + FLOAT temp1, temp2; + FLOAT tmp1[4]; + FLOAT tmp2[4]; + FLOAT *a0, *a1, *a2, *a3; + FLOAT x0, x1, x2, x3; + FLOAT *X = x; + FLOAT *Y = y; + + if (inc_y != 1) { + Y = buffer; + COPY_K(m, y, inc_y, Y, 1); + } + if (inc_x != 1) { + if (inc_y != 1) { + X = Y + m; + } else { + X = buffer; + } + COPY_K(m, x, inc_x, X, 1); + } + + BLASLONG offset1 = (offset / 4) * 4; + for (j = 0; j < offset1; j+=4) { + a0 = &a[j*lda]; + a1 = a0 + lda; + a2 = a1 + lda; + a3 = a2 + lda; + x0 = X[j]; + x1 = X[j+1]; + x2 = X[j+2]; + x3 = X[j+3]; + tmp2[0] = a0[j ]*x0 + a0[j+1]*x1 + a0[j+2]*x2 + a0[j+3]*x3; + tmp2[1] = a0[j+1]*x0 + a1[j+1]*x1 + a1[j+2]*x2 + a1[j+3]*x3; + tmp2[2] = a0[j+2]*x0 + a1[j+2]*x1 + a2[j+2]*x2 + a2[j+3]*x3; + tmp2[3] = a0[j+3]*x0 + a1[j+3]*x1 + a2[j+3]*x2 + a3[j+3]*x3; + tmp1[0] = alpha * x0; + tmp1[1] = alpha * x1; + tmp1[2] = alpha * x2; + tmp1[3] = alpha * x3; + + BLASLONG m2 = (m/4)*4; + if (m2 > j+4) + symv_kernel_4x4(j+4, m2, a0, a1, a2, a3, X, Y, tmp1, tmp2); + + for (i = m2; i < m; i++) { + Y[i] += tmp1[0] * a0[i]; + tmp2[0] += a0[i] * X[i]; + Y[i] += tmp1[1] * a1[i]; + tmp2[1] += a1[i] * X[i]; + Y[i] += tmp1[2] * a2[i]; + tmp2[2] += a2[i] * X[i]; + Y[i] += tmp1[3] * a3[i]; + tmp2[3] += a3[i] * X[i]; + } + Y[j] += alpha * tmp2[0]; + Y[j+1] += alpha * tmp2[1]; + Y[j+2] += alpha * tmp2[2]; + Y[j+3] += alpha * tmp2[3]; + } + + for (j = offset1; j < offset; j++) { + temp1 = alpha * X[j]; + temp2 = 0.0; + Y[j] += temp1 * a[j*lda+j]; + for (i = j+1; i < m; i++) { + Y[i] += temp1 * a[j*lda+i]; + temp2 += a[j*lda+i] * X[i]; + } + Y[j] += alpha * temp2; + } + + if (inc_y != 1) { + COPY_K(m, Y, 1, y, inc_y); + } + return(0); +} diff --git a/kernel/arm64/symv_L_sve_v1x4.c b/kernel/arm64/symv_L_sve_v1x4.c new file mode 100644 index 000000000..4b9252339 --- /dev/null +++ b/kernel/arm64/symv_L_sve_v1x4.c @@ -0,0 +1,103 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS 
Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "symv_microk_sve_v1x4.c" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j; + FLOAT temp1, temp2; + FLOAT tmp1[4]; + FLOAT tmp2[4]; + FLOAT *a0, *a1, *a2, *a3; + FLOAT x0, x1, x2, x3; + FLOAT *X = x; + FLOAT *Y = y; + + if (inc_y != 1) { + Y = buffer; + COPY_K(m, y, inc_y, Y, 1); + } + if (inc_x != 1) { + if (inc_y != 1) { + X = Y + m; + } else { + X = buffer; + } + COPY_K(m, x, inc_x, X, 1); + } + + BLASLONG offset1 = (offset / 4) * 4; + + for (j = 0; j < offset1; j+=4) { + a0 = &a[j*lda]; + a1 = a0 + lda; + a2 = a1 + lda; + a3 = a2 + lda; + x0 = X[j]; + x1 = X[j+1]; + x2 = X[j+2]; + x3 = X[j+3]; + tmp2[0] = a0[j ]*x0 + a0[j+1]*x1 + a0[j+2]*x2 + a0[j+3]*x3; + tmp2[1] = a0[j+1]*x0 + a1[j+1]*x1 + a1[j+2]*x2 + a1[j+3]*x3; + tmp2[2] = a0[j+2]*x0 + a1[j+2]*x1 + a2[j+2]*x2 + a2[j+3]*x3; + tmp2[3] = a0[j+3]*x0 + a1[j+3]*x1 + a2[j+3]*x2 + a3[j+3]*x3; + tmp1[0] = alpha * x0; + tmp1[1] = alpha * x1; + tmp1[2] = alpha * x2; + tmp1[3] = alpha * x3; + + symv_kernel_v1x4(j+4, m, a0, a1, a2, a3, X, Y, tmp1, tmp2); + + Y[j] += alpha * tmp2[0]; + Y[j+1] += alpha * tmp2[1]; + Y[j+2] += alpha * tmp2[2]; + Y[j+3] += alpha * tmp2[3]; + } + + for (j = offset1; j < offset; j++) { + temp1 = alpha * X[j]; + temp2 = 0.0; + a0 = &a[j*lda]; + Y[j] += temp1 * a0[j]; + for (i = j+1; i < m; i++) { + Y[i] += temp1 * a0[i]; + temp2 += a0[i] * X[i]; + } + Y[j] += alpha * temp2; + } + + if (inc_y != 1) { + COPY_K(m, Y, 1, y, inc_y); + } + return(0); +} diff --git a/kernel/arm64/symv_U_asimd_4x4.c b/kernel/arm64/symv_U_asimd_4x4.c new file mode 100644 index 000000000..83e954260 --- /dev/null +++ b/kernel/arm64/symv_U_asimd_4x4.c @@ -0,0 +1,106 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "symv_microk_asimd_4x4.c" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, j1, j2, m2; + FLOAT temp1, temp2; + FLOAT tmp1[4]; + FLOAT tmp2[4]; + FLOAT *a0, *a1, *a2, *a3; + FLOAT *X = x; + FLOAT *Y = y; + + BLASLONG m1 = m - offset; + if (inc_y != 1) { + Y = buffer; + COPY_K(m, y, inc_y, Y, 1); + } + if (inc_x != 1) { + if (inc_y != 1) { + X = Y + m; + } else { + X = buffer; + } + COPY_K(m, x, inc_x, X, 1); + } + + m2 = m - (offset % 4); + for (j = m1; j < m2; j += 4) { + tmp1[0] = alpha * X[j]; + tmp1[1] = alpha * X[j+1]; + tmp1[2] = alpha * X[j+2]; + tmp1[3] = alpha * X[j+3]; + tmp2[0] = 0.0; + tmp2[1] = 0.0; + tmp2[2] = 0.0; + tmp2[3] = 0.0; + a0 = &a[j*lda]; + a1 = a0 + lda; + a2 = a1 + lda; + a3 = a2 + lda; + j1 = (j / 4) * 4; + if ( j1 ) + symv_kernel_4x4(0, j1, a0, a1, a2, a3, X, Y, tmp1, tmp2); + + j2 = 0; + for (j1 = j ; j1 < j+4 ; j1++) { + temp1 = tmp1[j2]; + temp2 = tmp2[j2]; + a0 = &a[j1*lda]; + for (i=j ; i + +static void symv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, + FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ +#ifdef DOUBLE + float64x2_t vtmpx0 = vld1q_dup_f64(&temp1[0]); + float64x2_t vtmpx1 = vld1q_dup_f64(&temp1[1]); + float64x2_t vtmpx2 = vld1q_dup_f64(&temp1[2]); + float64x2_t vtmpx3 = vld1q_dup_f64(&temp1[3]); + float64x2_t vtmpy0 = {0.0, 0.0}; + float64x2_t vtmpy1 = {0.0, 0.0}; + float64x2_t vtmpy2 = {0.0, 0.0}; + float64x2_t vtmpy3 = {0.0, 0.0}; + float64x2_t vxl, vxh, vyl, vyh; + float64x2_t vap0l, vap0h, vap1l, vap1h, vap2l, vap2h, vap3l, vap3h; + BLASLONG i; + for (i = from; i < to; i+=4) { + vyl = vld1q_f64(&y[i]); + vyh = vld1q_f64(&y[i+2]); + vxl = vld1q_f64(&x[i]); + vxh = vld1q_f64(&x[i+2]); + vap0l = vld1q_f64(&a0[i]); + vap0h = vld1q_f64(&a0[i+2]); + vap1l = vld1q_f64(&a1[i]); + vap1h = vld1q_f64(&a1[i+2]); + vap2l = vld1q_f64(&a2[i]); + vap2h = vld1q_f64(&a2[i+2]); 
+ vap3l = vld1q_f64(&a3[i]); + vap3h = vld1q_f64(&a3[i+2]); + vyl = vfmaq_f64(vyl, vtmpx0, vap0l); + vyh = vfmaq_f64(vyh, vtmpx0, vap0h); + vyl = vfmaq_f64(vyl, vtmpx1, vap1l); + vyh = vfmaq_f64(vyh, vtmpx1, vap1h); + vyl = vfmaq_f64(vyl, vtmpx2, vap2l); + vyh = vfmaq_f64(vyh, vtmpx2, vap2h); + vyl = vfmaq_f64(vyl, vtmpx3, vap3l); + vyh = vfmaq_f64(vyh, vtmpx3, vap3h); + vtmpy0 = vfmaq_f64(vtmpy0, vxl, vap0l); + vtmpy0 = vfmaq_f64(vtmpy0, vxh, vap0h); + vtmpy1 = vfmaq_f64(vtmpy1, vxl, vap1l); + vtmpy2 = vfmaq_f64(vtmpy2, vxl, vap2l); + vtmpy1 = vfmaq_f64(vtmpy1, vxh, vap1h); + vtmpy2 = vfmaq_f64(vtmpy2, vxh, vap2h); + vtmpy3 = vfmaq_f64(vtmpy3, vxl, vap3l); + vtmpy3 = vfmaq_f64(vtmpy3, vxh, vap3h); + vst1q_f64(&y[i], vyl); + vst1q_f64(&y[i+2], vyh); + } + temp2[0] += vaddvq_f64(vtmpy0); + temp2[1] += vaddvq_f64(vtmpy1); + temp2[2] += vaddvq_f64(vtmpy2); + temp2[3] += vaddvq_f64(vtmpy3); +#else + float32x4_t vtmpx0 = vld1q_dup_f32(&temp1[0]); + float32x4_t vtmpx1 = vld1q_dup_f32(&temp1[1]); + float32x4_t vtmpx2 = vld1q_dup_f32(&temp1[2]); + float32x4_t vtmpx3 = vld1q_dup_f32(&temp1[3]); + float32x4_t vtmpy0 = {0.0, 0.0, 0.0, 0.0}; + float32x4_t vtmpy1 = {0.0, 0.0, 0.0, 0.0}; + float32x4_t vtmpy2 = {0.0, 0.0, 0.0, 0.0}; + float32x4_t vtmpy3 = {0.0, 0.0, 0.0, 0.0}; + float32x4_t vx, vy; + float32x4_t vap0, vap1, vap2, vap3; + BLASLONG i; + for (i = from; i < to; i+=4) { + vy = vld1q_f32(&y[i]); + vx = vld1q_f32(&x[i]); + vap0 = vld1q_f32(&a0[i]); + vap1 = vld1q_f32(&a1[i]); + vap2 = vld1q_f32(&a2[i]); + vap3 = vld1q_f32(&a3[i]); + vy = vfmaq_f32(vy, vtmpx0, vap0); + vy = vfmaq_f32(vy, vtmpx1, vap1); + vy = vfmaq_f32(vy, vtmpx2, vap2); + vy = vfmaq_f32(vy, vtmpx3, vap3); + vtmpy0 = vfmaq_f32(vtmpy0, vx, vap0); + vtmpy1 = vfmaq_f32(vtmpy1, vx, vap1); + vtmpy2 = vfmaq_f32(vtmpy2, vx, vap2); + vtmpy3 = vfmaq_f32(vtmpy3, vx, vap3); + vst1q_f32(&y[i], vy); + } + temp2[0] += vaddvq_f32(vtmpy0); + temp2[1] += vaddvq_f32(vtmpy1); + temp2[2] += vaddvq_f32(vtmpy2); + temp2[3] += vaddvq_f32(vtmpy3); +#endif +} diff --git a/kernel/arm64/symv_microk_sve_v1x4.c b/kernel/arm64/symv_microk_sve_v1x4.c new file mode 100644 index 000000000..f87613f39 --- /dev/null +++ b/kernel/arm64/symv_microk_sve_v1x4.c @@ -0,0 +1,89 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#ifdef DOUBLE +#define SV_COUNT svcntd +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64_s64 +#define SV_DUP svdup_f64 +#else +#define SV_COUNT svcntw +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32_s64 +#define SV_DUP svdup_f32 +#endif + +static void symv_kernel_v1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, + FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + SV_TYPE vtmpx0 = SV_DUP(temp1[0]); + SV_TYPE vtmpx1 = SV_DUP(temp1[1]); + SV_TYPE vtmpx2 = SV_DUP(temp1[2]); + SV_TYPE vtmpx3 = SV_DUP(temp1[3]); + SV_TYPE vtmpy0 = SV_DUP(0.0); + SV_TYPE vtmpy1 = SV_DUP(0.0); + SV_TYPE vtmpy2 = SV_DUP(0.0); + SV_TYPE vtmpy3 = SV_DUP(0.0); + SV_TYPE vx, vy; + SV_TYPE vap0, vap1, vap2, vap3; + BLASLONG i; + uint64_t sve_size = SV_COUNT(); + svbool_t pg; + + for (i = from; i < to; i += sve_size) { + pg = SV_WHILE(i, to); + vy = svld1(pg, &y[i]); + vx = svld1(pg, &x[i]); + vap0 = svld1(pg, &a0[i]); + vap1 = svld1(pg, &a1[i]); + vap2 = svld1(pg, &a2[i]); + vap3 = svld1(pg, &a3[i]); + vy = svmla_m(pg, vy, vtmpx0, vap0); + vy = svmla_m(pg, vy, vtmpx1, vap1); + vy = svmla_m(pg, vy, vtmpx2, vap2); + vy = svmla_m(pg, vy, vtmpx3, vap3); + vtmpy0 = svmla_m(pg, vtmpy0, vx, vap0); + vtmpy1 = svmla_m(pg, vtmpy1, vx, vap1); + vtmpy2 = svmla_m(pg, vtmpy2, vx, vap2); + vtmpy3 = svmla_m(pg, vtmpy3, vx, vap3); + svst1(pg, &y[i], vy); + } + pg = SV_TRUE(); + temp2[0] += svaddv(pg, vtmpy0); + temp2[1] += svaddv(pg, vtmpy1); + temp2[2] += svaddv(pg, vtmpy2); + temp2[3] += svaddv(pg, vtmpy3); +} From 7b66330deada3236e2bcd0ccf3e919ab33a88e28 Mon Sep 17 00:00:00 2001 From: zanpeeters Date: Tue, 15 Apr 2025 17:12:03 -0700 Subject: [PATCH 118/205] hw.perflevel[01].cpusperl changed to hw.perflevel[01].cpusperl2 --- cpuid_arm64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 20dbead23..03563a23b 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -378,9 +378,9 @@ int detect(void) cpulowperf=value64; sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0); if (value64 > 1) { - sysctlbyname("hw.perflevel0.cpusperl",&value64,&length64,NULL,0); + sysctlbyname("hw.perflevel0.cpusperl2",&value64,&length64,NULL,0); cpuhiperf=value64; - sysctlbyname("hw.perflevel1.cpusperl",&value64,&length64,NULL,0); + sysctlbyname("hw.perflevel1.cpusperl2",&value64,&length64,NULL,0); cpulowperf=value64; } sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); From d1c2528aed50bac13ef6b41aba012d2fea30eb4b Mon Sep 17 00:00:00 2001 From: zanpeeters Date: Tue, 15 Apr 2025 17:14:19 -0700 Subject: [PATCH 119/205] Add L1_DATA_LINESIZE for ifdef __APPLE__ --- cpuid_arm64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 03563a23b..95c1b9519 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -702,6 +702,7 @@ void 
get_cpuconfig(void) printf("#define L1_CODE_SIZE %lld \n",value64); sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); printf("#define L1_CODE_LINESIZE %lld \n",value64); + printf("#define L1_DATA_LINESIZE %lld \n",value64); sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); printf("#define L1_DATA_SIZE %lld \n",value64); sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); From acef78c778631ee23c8e23214b41226872e454f5 Mon Sep 17 00:00:00 2001 From: zanpeeters Date: Tue, 15 Apr 2025 17:17:17 -0700 Subject: [PATCH 120/205] Reset buffer length before every call to sysctlbyname. --- cpuid_arm64.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 95c1b9519..c60725828 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -374,15 +374,20 @@ int detect(void) } #else #ifdef __APPLE__ + length64 = sizeof(value64); sysctlbyname("hw.ncpu",&value64,&length64,NULL,0); cpulowperf=value64; + length64 = sizeof(value64); sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0); if (value64 > 1) { + length64 = sizeof(value64); sysctlbyname("hw.perflevel0.cpusperl2",&value64,&length64,NULL,0); cpuhiperf=value64; + length64 = sizeof(value64); sysctlbyname("hw.perflevel1.cpusperl2",&value64,&length64,NULL,0); cpulowperf=value64; } + length64 = sizeof(value64); sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 @@ -467,6 +472,7 @@ int n=0; printf("#define NUM_CORES_HP %d\n",cpuhiperf); #endif #ifdef __APPLE__ + length64 = sizeof(value64); sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); printf("#define NUM_CORES %d\n",value); if (cpulowperf >0) @@ -698,13 +704,17 @@ void get_cpuconfig(void) case CPU_VORTEX: printf("#define VORTEX \n"); #ifdef __APPLE__ + length64 = sizeof(value64); sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); printf("#define L1_CODE_SIZE %lld \n",value64); + length64 = sizeof(value64); sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); printf("#define L1_CODE_LINESIZE %lld \n",value64); printf("#define L1_DATA_LINESIZE %lld \n",value64); + length64 = sizeof(value64); sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); printf("#define L1_DATA_SIZE %lld \n",value64); + length64 = sizeof(value64); sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); printf("#define L2_SIZE %lld \n",value64); #endif From d9369bda1e5641ea1d5509ed122bc3f4181ff18f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Apr 2025 01:09:57 -0700 Subject: [PATCH 121/205] Update and amend parameters for Neoverse cpus --- cmake/prebuild.cmake | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index c8adf4ab2..4c100a770 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1006,15 +1006,15 @@ endif () "#define HAVE_SVE\n" "#define ARMV8\n") set(SGEMM_UNROLL_M 16) - set(SGEMM_UNROLL_N 4) - set(DGEMM_UNROLL_M 8) - set(DGEMM_UNROLL_N 4) - set(CGEMM_UNROLL_M 8) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 4) + set(DGEMM_UNROLL_N 8) + set(CGEMM_UNROLL_M 2) set(CGEMM_UNROLL_N 4) - set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_M 2) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) - elseif ("${TCORE}" STREQUAL "NEOVERSEN2" or "${TCORE}" STREQUAL "ARMV9SME") + elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" 
"#define L1_CODE_LINESIZE\t64\n" @@ -1249,6 +1249,25 @@ endif () set(ZGEMM_UNROLL_M 2) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "ARMV8SVE" OR "${TCORE}" STREQUAL "CORTEXA510" OR "${TCORE}" STREQUAL "CORTEXX2" OR "${TCORE}" STREQUAL "ARMV9") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE\t32768\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L2_SIZE\t262144\n" + "#define L2_LINESIZE\t64\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define L2_ASSOCIATIVE\t32\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 4) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 4) + set(DGEMM_UNROLL_N 8) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "P5600") file(APPEND ${TARGET_CONF_TEMP} "#define L2_SIZE 1048576\n" @@ -1409,9 +1428,11 @@ endif () # GetArch_2nd foreach(float_char S;D;Q;C;Z;X) if (NOT DEFINED ${float_char}GEMM_UNROLL_M) + message(STATUS "setting unrollm=2") set(${float_char}GEMM_UNROLL_M 2) endif() if (NOT DEFINED ${float_char}GEMM_UNROLL_N) + message(STATUS "setting unrolln=2") set(${float_char}GEMM_UNROLL_N 2) endif() endforeach() From d53572880398016fb5bee5b7aa96131926d295ec Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Wed, 9 Apr 2025 12:54:57 +0000 Subject: [PATCH 122/205] Improve performance for SGEMVN on NEONVERSEN1 --- kernel/arm64/KERNEL.NEOVERSEN1 | 2 +- kernel/arm64/sgemv_n_neon.c | 219 +++++++++++++++++++++++++++++++++ 2 files changed, 220 insertions(+), 1 deletion(-) create mode 100644 kernel/arm64/sgemv_n_neon.c diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 index e623814d6..de4d33c74 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN1 +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -60,7 +60,7 @@ DSCALKERNEL = scal.S CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S -SGEMVNKERNEL = gemv_n.S +SGEMVNKERNEL = sgemv_n_neon.c DGEMVNKERNEL = gemv_n.S CGEMVNKERNEL = zgemv_n.S ZGEMVNKERNEL = zgemv_n.S diff --git a/kernel/arm64/sgemv_n_neon.c b/kernel/arm64/sgemv_n_neon.c new file mode 100644 index 000000000..5fa86b350 --- /dev/null +++ b/kernel/arm64/sgemv_n_neon.c @@ -0,0 +1,219 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + if (inc_x == 1 && inc_y == 1) { + FLOAT *a0_ptr = a + lda * 0; + FLOAT *a1_ptr = a + lda * 1; + FLOAT *a2_ptr = a + lda * 2; + FLOAT *a3_ptr = a + lda * 3; + FLOAT *a4_ptr = a + lda * 4; + FLOAT *a5_ptr = a + lda * 5; + FLOAT *a6_ptr = a + lda * 6; + FLOAT *a7_ptr = a + lda * 7; + + j = 0; + while (j + 3 < n) { + float32x4_t x0_vec = vld1q_f32(x + j); + x0_vec = vmulq_n_f32(x0_vec, alpha); + i = 0; + while (i + 7 < m) { + float32x4_t a00_vec = vld1q_f32(a0_ptr + i); + float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4); + float32x4_t a10_vec = vld1q_f32(a1_ptr + i); + float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4); + float32x4_t a20_vec = vld1q_f32(a2_ptr + i); + float32x4_t a21_vec = vld1q_f32(a2_ptr + i + 4); + float32x4_t a30_vec = vld1q_f32(a3_ptr + i); + float32x4_t a31_vec = vld1q_f32(a3_ptr + i + 4); + + float32x4_t y0_vec = vld1q_f32(y + i); + float32x4_t y1_vec = vld1q_f32(y + i + 4); + y0_vec = vmlaq_laneq_f32(y0_vec, a00_vec, x0_vec, 0); + y0_vec = vmlaq_laneq_f32(y0_vec, a10_vec, x0_vec, 1); + y0_vec = vmlaq_laneq_f32(y0_vec, a20_vec, x0_vec, 2); + y0_vec = vmlaq_laneq_f32(y0_vec, a30_vec, x0_vec, 3); + y1_vec = vmlaq_laneq_f32(y1_vec, a01_vec, x0_vec, 0); + y1_vec = vmlaq_laneq_f32(y1_vec, a11_vec, x0_vec, 1); + y1_vec = vmlaq_laneq_f32(y1_vec, a21_vec, x0_vec, 2); + y1_vec = vmlaq_laneq_f32(y1_vec, a31_vec, x0_vec, 3); + + vst1q_f32(y + i, y0_vec); + vst1q_f32(y + i + 4, y1_vec); + + i += 8; + } + while (i + 3 < m) { + float32x4_t a0_vec = vld1q_f32(a0_ptr + i); + float32x4_t a1_vec = vld1q_f32(a1_ptr + i); + float32x4_t a2_vec = vld1q_f32(a2_ptr + i); + float32x4_t a3_vec = vld1q_f32(a3_ptr + i); + + float32x4_t y_vec = vld1q_f32(y + i); + y_vec = vmlaq_laneq_f32(y_vec, a0_vec, x0_vec, 0); + y_vec = vmlaq_laneq_f32(y_vec, a1_vec, x0_vec, 1); + y_vec = vmlaq_laneq_f32(y_vec, a2_vec, x0_vec, 2); + y_vec = vmlaq_laneq_f32(y_vec, a3_vec, x0_vec, 3); + + vst1q_f32(y + i, y_vec); + + i += 4; + } + while (i + 1 < m) { + float32x2_t a0_vec = vld1_f32(a0_ptr + i); + float32x2_t a1_vec = vld1_f32(a1_ptr + i); + float32x2_t a2_vec = vld1_f32(a2_ptr + i); + float32x2_t a3_vec = vld1_f32(a3_ptr + i); + + float32x2_t y_vec = vld1_f32(y + i); + y_vec = vmla_laneq_f32(y_vec, a0_vec, x0_vec, 0); + y_vec = vmla_laneq_f32(y_vec, a1_vec, x0_vec, 1); + y_vec = vmla_laneq_f32(y_vec, a2_vec, x0_vec, 2); + y_vec = vmla_laneq_f32(y_vec, a3_vec, x0_vec, 3); + + vst1_f32(y + i, y_vec); + + i += 2; + } + while (i < m) { + y[i] += a0_ptr[i] * x0_vec[0]; + y[i] += a1_ptr[i] * x0_vec[1]; + y[i] += a2_ptr[i] * x0_vec[2]; + y[i] += a3_ptr[i] * x0_vec[3]; + + i++; + } + + a0_ptr += lda * 4; + a1_ptr += lda * 4; + a2_ptr += lda * 
4; + a3_ptr += lda * 4; + + j += 4; + } + while (j + 1 < n) { + float32x2_t x0_vec = vld1_f32(x + j); + x0_vec = vmul_n_f32(x0_vec, alpha); + i = 0; + while (i + 7 < m) { + float32x4_t a00_vec = vld1q_f32(a0_ptr + i); + float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4); + float32x4_t a10_vec = vld1q_f32(a1_ptr + i); + float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4); + + float32x4_t y0_vec = vld1q_f32(y + i); + float32x4_t y1_vec = vld1q_f32(y + i + 4); + y0_vec = vmlaq_lane_f32(y0_vec, a00_vec, x0_vec, 0); + y0_vec = vmlaq_lane_f32(y0_vec, a10_vec, x0_vec, 1); + y1_vec = vmlaq_lane_f32(y1_vec, a01_vec, x0_vec, 0); + y1_vec = vmlaq_lane_f32(y1_vec, a11_vec, x0_vec, 1); + + vst1q_f32(y + i, y0_vec); + vst1q_f32(y + i + 4, y1_vec); + + i += 8; + } + while (i + 3 < m) { + float32x4_t a0_vec = vld1q_f32(a0_ptr + i); + float32x4_t a1_vec = vld1q_f32(a1_ptr + i); + + float32x4_t y_vec = vld1q_f32(y + i); + y_vec = vmlaq_lane_f32(y_vec, a0_vec, x0_vec, 0); + y_vec = vmlaq_lane_f32(y_vec, a1_vec, x0_vec, 1); + + vst1q_f32(y + i, y_vec); + + i += 4; + } + while (i + 1 < m) { + float32x2_t a0_vec = vld1_f32(a0_ptr + i); + float32x2_t a1_vec = vld1_f32(a1_ptr + i); + + float32x2_t y_vec = vld1_f32(y + i); + y_vec = vmla_lane_f32(y_vec, a0_vec, x0_vec, 0); + y_vec = vmla_lane_f32(y_vec, a1_vec, x0_vec, 1); + + vst1_f32(y + i, y_vec); + + i += 2; + } + while (i < m) { + y[i] += a0_ptr[i] * x0_vec[0]; + y[i] += a1_ptr[i] * x0_vec[1]; + + i++; + } + + a0_ptr += lda * 2; + a1_ptr += lda * 2; + + j += 2; + } + while (j < n) { + i = 0; + temp = alpha * x[j]; + while (i < m) { + y[i] += a0_ptr[i] * temp; + i++; + } + + a0_ptr += lda; + j++; + } + return (0); + } + + for (j = 0; j < n; j++) { + temp = alpha * x[ix]; + iy = 0; + for (i = 0; i < m; i++) { + y[iy] += temp * a_ptr[i]; + iy += inc_y; + } + a_ptr += lda; + ix += inc_x; + } + return (0); +} From 1f687b2f6001009b271f9ba790760e35e4334530 Mon Sep 17 00:00:00 2001 From: Han Gao Date: Sun, 20 Apr 2025 14:20:49 +0800 Subject: [PATCH 123/205] Bump xuantie qemu for c910v Signed-off-by: Han Gao --- .github/workflows/c910v.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml index 1dd3a2c71..c5b497316 100644 --- a/.github/workflows/c910v.yml +++ b/.github/workflows/c910v.yml @@ -31,7 +31,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: install build deps run: | @@ -40,18 +40,18 @@ jobs: gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev - name: checkout qemu - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: - repository: T-head-Semi/qemu + repository: XUANTIE-RV/qemu path: qemu - ref: 1e692ebb43d396c52352406323fc782c1ac99a42 + ref: e0ace167effcd36d1f82c7ccb4522b3126011479 # xuantie-qemu-9.0 - name: build qemu run: | # Force use c910v qemu-user - wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch + wget https://github.com/revyos/qemu/commit/222729c7455784dd855216d7a2bec4bd8f2a6800.patch cd qemu - patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch + patch -p1 < ../222729c7455784dd855216d7a2bec4bd8f2a6800.patch export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error" ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system make -j$(nproc) From 0cc248559459eccdf80fdd18510aff6920f9917a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 20 Apr 2025 07:50:04 +0000 
Subject: [PATCH 124/205] Explicit unaligned vector load/stores in PPC64LE GEMV kernels --- kernel/power/sgemv_n.c | 757 ++++++++++++++++++----------------------- kernel/power/sgemv_t.c | 311 ++++++++--------- 2 files changed, 477 insertions(+), 591 deletions(-) diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index f5c1ba729..79c651df7 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -17,454 +17,369 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #if !defined(__VEC__) || !defined(__ALTIVEC__) #include "../arm/gemv_n.c" #else -#include "common.h" +#include +#include "common.h" #define NBMAX 4096 -static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - +static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, + BLASLONG lda4, FLOAT *alpha) { BLASLONG i; - FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; - FLOAT x0,x1,x2,x3,x4,x5,x6,x7; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - x4 = xo[4] * *alpha; - x5 = xo[5] * *alpha; - x6 = xo[6] * *alpha; - x7 = xo[7] * *alpha; - __vector float* va0 = (__vector float*)a0; - __vector float* va1 = (__vector float*)a1; - __vector float* va2 = (__vector float*)a2; - __vector float* va3 = (__vector float*)a3; - __vector float* vb0 = (__vector float*)b0; - __vector float* vb1 = (__vector float*)b1; - __vector float* vb2 = (__vector float*)b2; - __vector float* vb3 = (__vector float*)b3; - - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float v_x4 = {x4,x4,x4,x4}; - __vector float v_x5 = {x5,x5,x5,x5}; - __vector float v_x6 = {x6,x6,x6,x6}; - __vector float v_x7 = {x7,x7,x7,x7}; - __vector float* v_y =(__vector float*)y; - - for ( i=0; i< n/4; i++) - { - register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; - v_y[i] =vy; + FLOAT *a0, *a1, *a2, *a3, *b0, *b1, *b2, *b3; + FLOAT x0, x1, x2, x3, x4, x5, x6, x7; + a0 = 
ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4; + b1 = a1 + lda4; + b2 = a2 + lda4; + b3 = a3 + lda4; + x0 = xo[0] * (*alpha); + x1 = xo[1] * (*alpha); + x2 = xo[2] * (*alpha); + x3 = xo[3] * (*alpha); + x4 = xo[4] * (*alpha); + x5 = xo[5] * (*alpha); + x6 = xo[6] * (*alpha); + x7 = xo[7] * (*alpha); + + __vector float v_x0 = {x0, x0, x0, x0}; + __vector float v_x1 = {x1, x1, x1, x1}; + __vector float v_x2 = {x2, x2, x2, x2}; + __vector float v_x3 = {x3, x3, x3, x3}; + __vector float v_x4 = {x4, x4, x4, x4}; + __vector float v_x5 = {x5, x5, x5, x5}; + __vector float v_x6 = {x6, x6, x6, x6}; + __vector float v_x7 = {x7, x7, x7, x7}; + + for (i = 0; i < n; i += 4) { + __vector float vy = vec_vsx_ld(0, &y[i]); + __vector float va0 = vec_vsx_ld(0, &a0[i]); + __vector float va1 = vec_vsx_ld(0, &a1[i]); + __vector float va2 = vec_vsx_ld(0, &a2[i]); + __vector float va3 = vec_vsx_ld(0, &a3[i]); + __vector float vb0 = vec_vsx_ld(0, &b0[i]); + __vector float vb1 = vec_vsx_ld(0, &b1[i]); + __vector float vb2 = vec_vsx_ld(0, &b2[i]); + __vector float vb3 = vec_vsx_ld(0, &b3[i]); + vy += v_x0 * va0 + v_x1 * va1 + v_x2 * va2 + v_x3 * va3; + vy += v_x4 * vb0 + v_x5 * vb1 + v_x6 * vb2 + v_x7 * vb3; + vec_vsx_st(vy, 0, &y[i]); } - } - -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, + FLOAT *alpha) { BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - __vector float* va2 = (__vector float*)ap[2]; - __vector float* va3 = (__vector float*)ap[3]; - - for ( i=0; i< n/4; i++ ) - { - register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - v_y[i] =vy; + FLOAT x0, x1, x2, x3; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + x0 = xo[0] * (*alpha); + x1 = xo[1] * (*alpha); + x2 = xo[2] * (*alpha); + x3 = xo[3] * (*alpha); + __vector float v_x0 = {x0, x0, x0, x0}; + __vector float v_x1 = {x1, x1, x1, x1}; + __vector float v_x2 = {x2, x2, x2, x2}; + __vector float v_x3 = {x3, x3, x3, x3}; + + for (i = 0; i < n; i += 4) { + __vector float vy = vec_vsx_ld(0, &y[i]); + __vector float va0 = vec_vsx_ld(0, &a0[i]); + __vector float va1 = vec_vsx_ld(0, &a1[i]); + __vector float va2 = vec_vsx_ld(0, &a2[i]); + __vector float va3 = vec_vsx_ld(0, &a3[i]); + vy += v_x0 * va0 + v_x1 * va1 + v_x2 * va2 + v_x3 * va3; + vec_vsx_st(vy, 0, &y[i]); } +} -} - -static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { BLASLONG i; - FLOAT x0,x1; - x0 = x[0] * *alpha; - x1 = x[1] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + FLOAT x0, x1; + FLOAT *a0, *a1; + a0 = ap[0]; + a1 = ap[1]; + x0 = x[0] * (*alpha); + x1 = x[1] * (*alpha); + __vector float v_x0 = {x0, x0, x0, x0}; 
+ __vector float v_x1 = {x1, x1, x1, x1}; + + for (i = 0; i < n; i += 4) { + __vector float vy = vec_vsx_ld(0, &y[i]); + __vector float va0 = vec_vsx_ld(0, &a0[i]); + __vector float va1 = vec_vsx_ld(0, &a1[i]); + vy += v_x0 * va0 + v_x1 * va1; + vec_vsx_st(vy, 0, &y[i]); } +} -} - - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { BLASLONG i; - FLOAT x0 ; - x0 = x[0] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] ; + FLOAT x0 = x[0] * (*alpha); + __vector float v_x0 = {x0, x0, x0, x0}; + + for (i = 0; i < n; i += 4) { + __vector float vy = vec_vsx_ld(0, &y[i]); + __vector float va0 = vec_vsx_ld(0, &ap[i]); + vy += v_x0 * va0; + vec_vsx_st(vy, 0, &y[i]); } - } - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { BLASLONG i; - - for ( i=0; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + BLASLONG i, n1, m1, m2, m3, n2, lda4, lda8; + FLOAT *a_ptr, *x_ptr, *y_ptr, *ap[4]; + + lda4 = lda << 2; + lda8 = lda << 3; + FLOAT xbuffer[8] __attribute__((aligned(16))); + FLOAT *ybuffer = buffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + if (inc_x == 1) { + n1 = n >> 3; + n2 = n & 7; + } else { + n1 = n >> 2; + n2 = n & 3; + } + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + y_ptr = y; + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if (inc_y != 1) + memset(ybuffer, 0, NB * 4); + else + ybuffer = y_ptr; + + if (inc_x == 1) { + for (i = 0; i < n1; i++) { + sgemv_kernel_4x8(NB, ap, x_ptr, ybuffer, lda4, &alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + if (n2 & 4) { + sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); + a_ptr += lda * 2; + x_ptr 
+= 2; + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); + a_ptr += lda; + x_ptr += 1; + } + + } else { + for (i = 0; i < n1; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); + a_ptr += lda; + } + } + + a += NB; + if (inc_y != 1) { + add_y(NB, ybuffer, y_ptr, inc_y); + y_ptr += NB * inc_y; + } else + y_ptr += NB; + } + + if (m3 == 0) return (0); + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if (lda == 3 && inc_x == 1) { + for (i = 0; i < (n & -4); i += 4) { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr++; + } + + } else { + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if (lda == 2 && inc_x == 1) { + for (i = 0; i < (n & -4); i += 4) { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr++; + } + + } else { + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return (0); + } + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if (lda == 1 && inc_x == 1) { + for (i = 0; i < (n & -4); i += 4) { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + + a_ptr[i + 2] * x_ptr[i + 2] + + a_ptr[i + 3] * x_ptr[i + 3]; + } + + for (; i < n; i++) { + temp += a_ptr[i] * x_ptr[i]; + } + + } else { + for (i = 0; i < n; i++) { + temp += a_ptr[0] * x_ptr[0]; a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - 
add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); - } - - - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); - } - - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); - } - - - return(0); + x_ptr += inc_x; + } + } + y_ptr[0] += alpha * temp; + return (0); + } + + return (0); } #endif - diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index ed0a24230..8a01d2de4 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -17,12 +17,12 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #if !defined(__VEC__) || !defined(__ALTIVEC__) #include "../arm/gemv_t.c" @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 -#include - -static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; +#include + +static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, + FLOAT *y, FLOAT alpha) { + BLASLONG i; FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; + register __vector float temp0 = {0, 0, 0, 0}; + register __vector float temp1 = {0, 0, 0, 0}; + register __vector float temp2 = {0, 0, 0, 0}; + register __vector float temp3 = {0, 0, 0, 0}; + register __vector float temp4 = {0, 0, 0, 0}; + register __vector float temp5 = {0, 0, 0, 0}; + register __vector float temp6 = {0, 0, 0, 0}; + register __vector float temp7 = {0, 0, 0, 0}; a0 = ap; a1 = ap + lda; @@ -56,43 +56,42 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA a5 = a4 + lda; a6 = a5 + lda; a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - temp4 += v_x[i] * va4[i]; - temp5 += v_x[i] * va5[i]; - temp6 += v_x[i] * va6[i]; - temp7 += v_x[i] * va7[i]; - } - - #if defined(POWER8) - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - #else - register __vector float t0, t1, t2, t3; - register __vector float a = { alpha, alpha, alpha, alpha }; - __vector float *v_y = (__vector float*) y; + for (i = 0; i < n; i += 4) { + __vector float vx = vec_vsx_ld(0, &x[i]); + __vector float vva0 = vec_vsx_ld(0, &a0[i]); + __vector float vva1 = vec_vsx_ld(0, &a1[i]); + __vector float vva2 = vec_vsx_ld(0, &a2[i]); + __vector float vva3 = vec_vsx_ld(0, &a3[i]); + __vector float vva4 = vec_vsx_ld(0, &a4[i]); + __vector float vva5 = 
vec_vsx_ld(0, &a5[i]); + __vector float vva6 = vec_vsx_ld(0, &a6[i]); + __vector float vva7 = vec_vsx_ld(0, &a7[i]); + temp0 += vx * vva0; + temp1 += vx * vva1; + temp2 += vx * vva2; + temp3 += vx * vva3; + temp4 += vx * vva4; + temp5 += vx * vva5; + temp6 += vx * vva6; + temp7 += vx * vva7; + } + +#if defined(POWER8) + y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1] + temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1] + temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1] + temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1] + temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1] + temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1] + temp7[2] + temp7[3]); +#else + register __vector float t0, t1, t2, t3; + register __vector float a = {alpha, alpha, alpha, alpha}; + __vector float vy0 = vec_vsx_ld(0, y); + __vector float vy1 = vec_vsx_ld(0, &(y[4])); t0 = vec_mergeh(temp0, temp2); t1 = vec_mergel(temp0, temp2); t2 = vec_mergeh(temp1, temp3); @@ -113,44 +112,46 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA temp7 = vec_mergel(t1, t3); temp4 += temp5 + temp6 + temp7; - v_y[0] += a * temp0; - v_y[1] += a * temp4; + vy0 += a * temp0; + vy1 += a * temp4; + vec_vsx_st(vy0, 0, y); + vec_vsx_st(vy1, 0, &(y[4])); #endif } - -static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { +static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, + FLOAT *y, FLOAT alpha) { BLASLONG i = 0; FLOAT *a0, *a1, *a2, *a3; a0 = ap; a1 = ap + lda; a2 = a1 + lda; a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; + register __vector float temp0 = {0, 0, 0, 0}; + register __vector float temp1 = {0, 0, 0, 0}; + register __vector float temp2 = {0, 0, 0, 0}; + register __vector float temp3 = {0, 0, 0, 0}; + + for (i = 0; i < n; i += 4) { + __vector float vx = vec_vsx_ld(0, &x[i]); + __vector float vva0 = vec_vsx_ld(0, &a0[i]); + __vector float vva1 = vec_vsx_ld(0, &a1[i]); + __vector float vva2 = vec_vsx_ld(0, &a2[i]); + __vector float vva3 = vec_vsx_ld(0, &a3[i]); + temp0 += vx * vva0; + temp1 += vx * vva1; + temp2 += vx * vva2; + temp3 += vx * vva3; } - #if defined(POWER8) - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - #else +#if defined(POWER8) + y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1] + temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1] + temp3[2] + temp3[3]); +#else register __vector float t0, t1, t2, t3; - register __vector float a = { alpha, alpha, alpha, alpha }; - __vector float *v_y = (__vector float*) y; + register 
__vector float a = {alpha, alpha, alpha, alpha}; + __vector float vy0 = vec_vsx_ld(0, y); t0 = vec_mergeh(temp0, temp2); t1 = vec_mergel(temp0, temp2); @@ -162,47 +163,42 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA temp3 = vec_mergel(t1, t3); temp0 += temp1 + temp2 + temp3; - v_y[0] += a * temp0; + vy0 += a * temp0; + vec_vsx_st(vy0, 0, y); #endif } - - -static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { +static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, + FLOAT *y, FLOAT alpha, BLASLONG inc_y) { BLASLONG i; FLOAT *a0, *a1; a0 = ap; a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; + __vector float temp0 = {0, 0, 0, 0}; + __vector float temp1 = {0, 0, 0, 0}; + for (i = 0; i < n; i += 4) { + __vector float vx = vec_vsx_ld(0, &x[i]); + __vector float vva0 = vec_vsx_ld(0, &a0[i]); + __vector float vva1 = vec_vsx_ld(0, &a1[i]); + temp0 += vx * vva0; + temp1 += vx * vva1; } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]); } -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, + FLOAT alpha) { BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i] ; + __vector float temp0 = {0, 0, 0, 0}; + for (i = 0; i < n; i += 4) { + __vector float vx = vec_vsx_ld(0, &x[i]); + __vector float vva0 = vec_vsx_ld(0, &ap[i]); + temp0 += vx * vva0; } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - + y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { @@ -213,20 +209,14 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + + BLASLONG i, j, n1, m1, m2, m3, n2; + FLOAT *a_ptr, *x_ptr, *y_ptr; FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; + FLOAT *xbuffer; if (m < 1) return (0); if (n < 1) return (0); @@ -242,7 +232,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG NB = NBMAX; while (NB == NBMAX) { - m1 -= NB; if (m1 < 0) { if (m2 == 0) break; @@ -260,20 +249,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG lda8 = lda << 3; - if (inc_y == 1) { - for (i = 0; i < n1; i++) { - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - + y_ptr += 8; 
a_ptr += lda8; - } } else { - for (i = 0; i < n1; i++) { ybuffer[0] = 0; ybuffer[1] = 0; @@ -285,8 +269,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ybuffer[7] = 0; sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - *y_ptr += ybuffer[0]; y_ptr += inc_y; *y_ptr += ybuffer[1]; @@ -307,10 +289,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr += lda8; } - } - if (n2 & 4) { ybuffer[0] = 0; ybuffer[1] = 0; @@ -318,7 +298,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ybuffer[3] = 0; sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - a_ptr += lda<<2; + a_ptr += lda << 2; *y_ptr += ybuffer[0]; y_ptr += inc_y; @@ -334,20 +314,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); a_ptr += lda << 1; y_ptr += 2 * inc_y; - } if (n2 & 1) { sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); a_ptr += lda; y_ptr += inc_y; - } a += NB; x += NB * inc_x; - - } if (m3 == 0) return (0); @@ -365,13 +341,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO y_ptr = y; if (lda == 3 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + y_ptr[j + 1] += + aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += + aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += + aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; aj += 12; } @@ -381,38 +358,40 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } } else { - if (inc_y == 1) { - BLASLONG register lda2 = lda << 1; BLASLONG register lda4 = lda << 2; BLASLONG register lda3 = lda2 + lda; for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + y_ptr[j] += + *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + + *(aj + lda + 1) * xtemp1 + + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + + *(aj + lda2 + 1) * xtemp1 + + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + + *(aj + lda3 + 1) * xtemp1 + + *(aj + lda3 + 2) * xtemp2; aj += lda4; } for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j] += + *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; aj += lda; } } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + *y_ptr += + *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; y_ptr += inc_y; aj += lda; } - } - } return (0); } @@ -426,14 +405,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO y_ptr = y; if (lda == 2 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; 
y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; aj += 8; - } for (; j < n; j++) { @@ -443,22 +420,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } else { if (inc_y == 1) { - BLASLONG register lda2 = lda << 1; BLASLONG register lda4 = lda << 2; BLASLONG register lda3 = lda2 + lda; for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + y_ptr[j + 1] += + *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += + *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += + *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; aj += lda4; } for (; j < n; j++) { - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; aj += lda; } @@ -470,10 +447,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO aj += lda; } } - } return (0); - } FLOAT xtemp = *x_ptr * alpha; @@ -490,10 +465,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO y_ptr[j] += aj[j] * xtemp; } - } else { if (inc_y == 1) { - BLASLONG register lda2 = lda << 1; BLASLONG register lda4 = lda << 2; BLASLONG register lda3 = lda2 + lda; @@ -516,12 +489,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO y_ptr += inc_y; aj += lda; } - } } return (0); - } #endif From d659f3c3f6058ece8a704423b72ece3821cfba4d Mon Sep 17 00:00:00 2001 From: Ruiyang Wu Date: Wed, 16 Apr 2025 12:15:44 -0400 Subject: [PATCH 125/205] Fix "Argument list too long" compilation error for Intel macOS --- CMakeLists.txt | 60 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e56a013b2..5f46b905d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -299,23 +299,49 @@ if (USE_OPENMP) endif() endif() -# Seems that this hack doesn't required since macOS 11 Big Sur -if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) - set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) - if (NOT NOFORTRAN) - set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) - set (CMAKE_Fortran_CREATE_SHARED_LIBRARY - "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " - "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " - "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" - "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" - "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") - else () - set (CMAKE_C_CREATE_SHARED_LIBRARY - "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " - "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " - "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") - endif () +# Fix "Argument list too long" for macOS with Intel CPUs 
and DYNAMIC_ARCH turned on +if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64")) + # Use response files + set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) + # Always build static library first + if(INTERFACE64) + set(STATIC_FILE "libopenblas_64.a") + else() + set(STATIC_FILE "libopenblas.a") + endif() + if(BUILD_STATIC_LIBS) + set(STATIC_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/${STATIC_FILE}") + else() + add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) + set(STATIC_PATH STATIC_FILE) + endif() + set(CREATE_STATIC_LIBRARY_COMMAND + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru ${STATIC_PATH} && exit 0' " + "sh -c '${CMAKE_AR} -rs ${STATIC_PATH} ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' ") + if(BUILD_SHARED_LIBS) + add_dependencies(${OpenBLAS_LIBNAME}_shared ${OpenBLAS_LIBNAME}_static) + set(SHARED_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib") + endif() + if(USE_OPENMP) + get_target_property(OMP_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES) + else() + set(OMP_LIB "") + endif() + if(NOT NOFORTRAN) + set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) + set(CMAKE_Fortran_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND}) + if(BUILD_SHARED_LIBS) + set(CMAKE_Fortran_CREATE_SHARED_LIBRARY + "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" + "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} dummy.o -o ${SHARED_PATH} ${OMP_LIB}'") + endif() + else() + set(CMAKE_C_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND}) + if(BUILD_SHARED_LIBS) + set(CMAKE_C_CREATE_SHARED_LIBRARY + "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} -o ${SHARED_PATH} ${OMP_LIB}'") + endif() + endif() endif() # Handle MSVC exports From 9aa7a0b2a7b2770adec6ff26b34660d3bcd8c49c Mon Sep 17 00:00:00 2001 From: Ruiyang Wu Date: Sun, 20 Apr 2025 22:55:19 -0400 Subject: [PATCH 126/205] Follow-up to d659f3c --- CMakeLists.txt | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f46b905d..f94c4c474 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -304,19 +304,14 @@ if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64") # Use response files set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) # Always build static library first - if(INTERFACE64) - set(STATIC_FILE "libopenblas_64.a") - else() - set(STATIC_FILE "libopenblas.a") - endif() if(BUILD_STATIC_LIBS) - set(STATIC_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/${STATIC_FILE}") + set(STATIC_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/lib${OpenBLAS_LIBNAME}.a") else() add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) - set(STATIC_PATH STATIC_FILE) + set(STATIC_PATH "lib${OpenBLAS_LIBNAME}.a") endif() set(CREATE_STATIC_LIBRARY_COMMAND - "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru ${STATIC_PATH} && exit 0' " + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/${OpenBLAS_LIBNAME}_static.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru ${STATIC_PATH} && exit 0' " "sh -c '${CMAKE_AR} -rs ${STATIC_PATH} ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' ") if(BUILD_SHARED_LIBS) add_dependencies(${OpenBLAS_LIBNAME}_shared ${OpenBLAS_LIBNAME}_static) 
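
Note on the two CMakeLists.txt patches above: they work around the macOS ARG_MAX limit by feeding the DYNAMIC_ARCH object list to the archiver in batches (via a response file and xargs) and then linking the dylib from the resulting static archive. A minimal sketch of the same idea outside CMake, using hypothetical file names (objects.rsp holding one object path per line) rather than the exact paths the build generates:

    # Archive the objects in batches so no single ar invocation exceeds the
    # kernel's argument-length limit, accumulating into one static archive.
    cat objects.rsp | xargs -n 1024 ar -ru libopenblas.a
    # Add the xerbla object and refresh the archive index.
    ar -rs libopenblas.a xerbla.o
    # Link the shared library from the archive; -force_load keeps every
    # member even if none of its symbols are referenced at link time.
    clang -fpic -shared -Wl,-force_load,libopenblas.a -o libopenblas.dylib

Only the archiver step needs batching; the final link sees a single archive argument, so it stays under the limit no matter how many objects DYNAMIC_ARCH produces.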
From 96d80801bc2a5df7d85f73dbd2ecc8673dad0292 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Apr 2025 22:53:26 +0200 Subject: [PATCH 127/205] Reinstate the CooperLake microkernel --- kernel/x86_64/sbgemv_n.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/sbgemv_n.c b/kernel/x86_64/sbgemv_n.c index b2d4eb74f..08ccace61 100644 --- a/kernel/x86_64/sbgemv_n.c +++ b/kernel/x86_64/sbgemv_n.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -//#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) -//#include "sbgemv_n_microk_cooperlake.c" -//#endif +#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "sbgemv_n_microk_cooperlake.c" +#endif #define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \ ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \ From 99d9f1ff3878e791591671f92f3a2470a1e565e6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Apr 2025 22:55:45 +0200 Subject: [PATCH 128/205] Fix conditional --- kernel/x86_64/sbgemv_n_microk_cooperlake_template.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c index 4711e9720..ab22e0848 100644 --- a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c @@ -231,7 +231,7 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); - if ((m-tag_m_32x) > 16) { + if ((m-tag_m_32x) >= 16) { STORE16_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0) STORE16_MASK_COMPLETE_RESULT(accum512_9, y+tag_m_32x+16, store_tail_mask) } else { From 4ec62d7f730b13d0ee48ce6f03d879f82696d9e6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Apr 2025 23:14:10 +0200 Subject: [PATCH 129/205] remove non-vectorized code path for power8, restoring PR4880 --- kernel/power/sgemv_t.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index 8a01d2de4..c66b003d2 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -77,17 +77,7 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, temp7 += vx * vva7; } -#if defined(POWER8) - y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1] + temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1] + temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1] + temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1] + temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1] + temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1] + temp7[2] + temp7[3]); -#else + register __vector float t0, t1, t2, t3; register __vector float a = {alpha, alpha, alpha, alpha}; __vector float vy0 = vec_vsx_ld(0, y); @@ -116,7 +106,7 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, vy1 += a * temp4; vec_vsx_st(vy0, 0, y); vec_vsx_st(vy1, 0, &(y[4])); -#endif + } static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, @@ -143,12 +133,7 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, temp2 += vx * vva2; temp3 += vx * vva3; } -#if defined(POWER8) - y[0] += 
alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1] + temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1] + temp3[2] + temp3[3]); -#else + register __vector float t0, t1, t2, t3; register __vector float a = {alpha, alpha, alpha, alpha}; __vector float vy0 = vec_vsx_ld(0, y); @@ -165,7 +150,7 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, vy0 += a * temp0; vec_vsx_st(vy0, 0, y); -#endif + } static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, From e11744a41163c5ce6e744214f54fb583e294f598 Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Tue, 22 Apr 2025 09:40:13 +0000 Subject: [PATCH 130/205] Use SVE kernel for S/DGEMVN for SVE machines --- kernel/arm64/KERNEL.ARMV8SVE | 4 ++-- kernel/arm64/KERNEL.NEOVERSEN2 | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 0e51f2c2f..a8371a03c 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -74,8 +74,8 @@ DSCALKERNEL = scal.S CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S -SGEMVNKERNEL = gemv_n_sve.c -DGEMVNKERNEL = gemv_n.S +SGEMVNKERNEL = gemv_n_sve_v1x3.c +DGEMVNKERNEL = gemv_n_sve_v1x3.c CGEMVNKERNEL = zgemv_n.S ZGEMVNKERNEL = zgemv_n.S diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index b9dc23562..c8d511f20 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -60,8 +60,8 @@ DSCALKERNEL = scal.S CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S +SGEMVNKERNEL = gemv_n_sve_v1x3.c +DGEMVNKERNEL = gemv_n_sve_v1x3.c CGEMVNKERNEL = zgemv_n.S ZGEMVNKERNEL = zgemv_n.S From 08b5c18d707adc363e5047793fdee5218db63b20 Mon Sep 17 00:00:00 2001 From: "Iha, Taisei" Date: Tue, 22 Apr 2025 19:56:44 +0900 Subject: [PATCH 131/205] fixed a potential out-of-bounds on gemv. --- kernel/arm64/gemv_n_sve_v1x3.c | 6 +++--- kernel/arm64/gemv_n_sve_v4x3.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/arm64/gemv_n_sve_v1x3.c b/kernel/arm64/gemv_n_sve_v1x3.c index 44c9a89b9..d6aa3d389 100644 --- a/kernel/arm64/gemv_n_sve_v1x3.c +++ b/kernel/arm64/gemv_n_sve_v1x3.c @@ -77,9 +77,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); - SV_TYPE temp0_vec = SV_DUP(alpha * x0_ptr[ix]); - SV_TYPE temp1_vec = SV_DUP(alpha * x1_ptr[ix]); - SV_TYPE temp2_vec = SV_DUP(alpha * x2_ptr[ix]); + SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0); + SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0); + SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0); i = 0; BLASLONG sve_size = SV_COUNT(); while ((i + sve_size * 1 - 1) < m) { diff --git a/kernel/arm64/gemv_n_sve_v4x3.c b/kernel/arm64/gemv_n_sve_v4x3.c index 92e4f75b6..0a7018303 100644 --- a/kernel/arm64/gemv_n_sve_v4x3.c +++ b/kernel/arm64/gemv_n_sve_v4x3.c @@ -86,9 +86,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, svbool_t pg22 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); svbool_t pg32 = ((j + width * 2) < n) ? 
SV_TRUE() : svpfalse(); - SV_TYPE temp0_vec = SV_DUP(alpha * x0_ptr[ix]); - SV_TYPE temp1_vec = SV_DUP(alpha * x1_ptr[ix]); - SV_TYPE temp2_vec = SV_DUP(alpha * x2_ptr[ix]); + SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0); + SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0); + SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0); i = 0; BLASLONG sve_size = SV_COUNT(); while ((i + sve_size * 4 - 1) < m) { From 9c02cdb073264d3c5360f077d4f2baa157e08fc9 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Mon, 24 Mar 2025 01:00:50 +0530 Subject: [PATCH 132/205] optimise dot using thread throttling for NEOVERSE V1 --- kernel/arm64/dot.c | 53 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/kernel/arm64/dot.c b/kernel/arm64/dot.c index 4607ebc59..ece31bccd 100644 --- a/kernel/arm64/dot.c +++ b/kernel/arm64/dot.c @@ -48,6 +48,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); + +#ifdef DYNAMIC_ARCH +extern char* gotoblas_corename(void); +#endif + +#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) +static inline int get_dot_optimal_nthreads_neoversev1(BLASLONG N, int ncpu) { + #ifdef DOUBLE + return (N <= 10000L) ? 1 + : (N <= 64500L) ? 1 + : (N <= 100000L) ? MIN(ncpu, 2) + : (N <= 150000L) ? MIN(ncpu, 4) + : (N <= 260000L) ? MIN(ncpu, 8) + : (N <= 360000L) ? MIN(ncpu, 16) + : (N <= 520000L) ? MIN(ncpu, 24) + : (N <= 1010000L) ? MIN(ncpu, 56) + : ncpu; + #else + return (N <= 10000L) ? 1 + : (N <= 110000L) ? 1 + : (N <= 200000L) ? MIN(ncpu, 2) + : (N <= 280000L) ? MIN(ncpu, 4) + : (N <= 520000L) ? MIN(ncpu, 8) + : (N <= 830000L) ? MIN(ncpu, 16) + : (N <= 1010000L) ? 
MIN(ncpu, 24) + : ncpu; + #endif +} +#endif + +static inline int get_dot_optimal_nthreads(BLASLONG n) { + int ncpu = num_cpu_avail(1); + +#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16) + return get_dot_optimal_nthreads_neoversev1(n, ncpu); +#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16) + if (strcmp(gotoblas_corename(), "neoversev1") == 0) { + return get_dot_optimal_nthreads_neoversev1(n, ncpu); + } +#endif + + // Default case + if (n <= 10000L) + return 1; + else + return num_cpu_avail(1); +} #endif static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) @@ -85,10 +132,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y RETURN_TYPE dot = 0.0; #if defined(SMP) - if (inc_x == 0 || inc_y == 0 || n <= 10000) + if (inc_x == 0 || inc_y == 0) nthreads = 1; else - nthreads = num_cpu_avail(1); + nthreads = get_dot_optimal_nthreads(n); if (nthreads == 1) { dot = dot_compute(n, x, inc_x, y, inc_y); @@ -105,7 +152,7 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, result, 0, - ( void *)dot_thread_function, nthreads); + (void *)dot_thread_function, nthreads); ptr = (RETURN_TYPE *)result; for (i = 0; i < nthreads; i++) { From 0c239c9d483840dc6c939601b1a86fad973e9e4a Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Tue, 22 Apr 2025 21:56:05 +0530 Subject: [PATCH 133/205] update contribution list --- CONTRIBUTORS.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 6b0814dcc..d8f57ef60 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -250,4 +250,7 @@ In chronological order: * Ye Tao * [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 - * [2025-02-27] Add sbgemv_n_neon kernel \ No newline at end of file + * [2025-02-27] Add sbgemv_n_neon kernel + +* Abhishek Kumar + * [2025-04-22] Optimise dot kernel for NEOVERSE V1 \ No newline at end of file From e1bd631593b1f1437d5b79f9ddca1e3ef19a89a7 Mon Sep 17 00:00:00 2001 From: chitao1234 Date: Thu, 24 Apr 2025 18:59:10 +0800 Subject: [PATCH 134/205] allow the use of LAPACK_COMPLEX_CPP when using MSVC compiler --- lapack-netlib/LAPACKE/include/lapacke_config.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index 4ef542fb1..a8679dfa2 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -75,7 +75,16 @@ extern "C" { #ifndef LAPACK_COMPLEX_CUSTOM #if defined(_MSC_VER) && !defined(__INTEL_CLANG_COMPILER) +#if defined(LAPACK_COMPLEX_CPP) + #include + #define lapack_complex_float std::complex + #define lapack_complex_double std::complex + #define lapack_complex_float_real(z) ((z).real()) + #define lapack_complex_float_imag(z) ((z).imag()) + #define lapack_complex_double_real(z) ((z).real()) + #define lapack_complex_double_imag(z) ((z).imag()) #define _CRT_USE_C_COMPLEX_H +#else #include #define LAPACK_COMPLEX_CUSTOM #define lapack_complex_float _Fcomplex @@ -84,6 +93,7 @@ extern "C" { #define lapack_complex_float_imag(z) (cimag(z)) #define lapack_complex_double_real(z) (creal(z)) #define lapack_complex_double_imag(z) (cimag(z)) +#endif #else #if defined(LAPACK_COMPLEX_STRUCTURE) From 7616c42095ead2755633eb41d98f5d17d58a9800 Mon Sep 17 00:00:00 2001 From: guoyuanplct Date: Fri, 25 Apr 2025 00:05:15 
+0800 Subject: [PATCH 135/205] Optimized RVV_ZVL256B Implementation of zgemv_n The implementation of zgemv_n using RVV_ZVL256B has been optimized. Compared to the previous implementation, it has achieved a 1.5x performance improvement. --- kernel/riscv64/zgemv_n_vector.c | 197 ++++++++++++++++++++++++++++---- 1 file changed, 174 insertions(+), 23 deletions(-) diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index 104d3865d..77f55cdd9 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -27,23 +27,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) -#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) -#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) -#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) -#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) -#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m2) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m2) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m2) +#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m2) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2) #else -#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) -#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) -#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) -#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) -#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) -#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m2) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m2) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m2) +#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m2) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -51,8 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG i = 0, j = 0, k = 0; BLASLONG ix = 0, iy = 0; FLOAT *a_ptr = a; - FLOAT temp_r = 0.0, temp_i = 0.0; - FLOAT_V_T va0, va1, vy0, vy1; + FLOAT temp_r = 0.0, temp_i = 0.0 ,temp_r1 ,temp_i1, temp_r2 ,temp_i2 ,temp_r3, temp_i3,temp_rr[4] ,temp_ii[4]; + FLOAT_V_T va0, va1, vy0, vy1,vy0_new, vy1_new, va2 , va3 , va4 , va5, va6 , va7, temp_iv , temp_rv,x_v0 , x_v1,temp_v1 , temp_v2 , temp_v3 , temp_v4; unsigned int gvl = 0; BLASLONG stride_a = sizeof(FLOAT) * 2; BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; @@ -60,12 +64,20 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG inc_yv = inc_y * gvl * 2; BLASLONG inc_x2 = inc_x * 2; BLASLONG lda2 = lda * 2; + vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1_new = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); for(k=0,j=0; k Date: Fri, 25 Apr 2025 00:27:27 +0800 Subject: [PATCH 136/205] Format the code --- kernel/riscv64/zgemv_n_vector.c 
| 342 ++++++++++++++++---------------- 1 file changed, 169 insertions(+), 173 deletions(-) diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index 77f55cdd9..cbed06c97 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -52,11 +52,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i = 0, j = 0, k = 0; + BLASLONG i = 0, j = 0, k = 0; BLASLONG ix = 0, iy = 0; FLOAT *a_ptr = a; - FLOAT temp_r = 0.0, temp_i = 0.0 ,temp_r1 ,temp_i1, temp_r2 ,temp_i2 ,temp_r3, temp_i3,temp_rr[4] ,temp_ii[4]; - FLOAT_V_T va0, va1, vy0, vy1,vy0_new, vy1_new, va2 , va3 , va4 , va5, va6 , va7, temp_iv , temp_rv,x_v0 , x_v1,temp_v1 , temp_v2 , temp_v3 , temp_v4; + FLOAT temp_r = 0.0, temp_i = 0.0, temp_r1, temp_i1, temp_r2, temp_i2, temp_r3, temp_i3, temp_rr[4], temp_ii[4]; + FLOAT_V_T va0, va1, vy0, vy1, vy0_new, vy1_new, va2, va3, va4, va5, va6, va7, temp_iv, temp_rv, x_v0, x_v1, temp_v1, temp_v2, temp_v3, temp_v4; unsigned int gvl = 0; BLASLONG stride_a = sizeof(FLOAT) * 2; BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; @@ -64,56 +64,58 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG inc_yv = inc_y * gvl * 2; BLASLONG inc_x2 = inc_x * 2; BLASLONG lda2 = lda * 2; - vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl); - vy1_new = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); - for(k=0,j=0; k Date: Wed, 30 Apr 2025 16:40:44 +0800 Subject: [PATCH 137/205] Loongarch64: fixed amax_lasx --- kernel/loongarch64/amax_lasx.S | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/loongarch64/amax_lasx.S b/kernel/loongarch64/amax_lasx.S index e964d4ddb..8d2acf283 100644 --- a/kernel/loongarch64/amax_lasx.S +++ b/kernel/loongarch64/amax_lasx.S @@ -56,17 +56,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif + xvxor.v VM0, VM0, VM0 bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT -#ifdef DOUBLE - xvldrepl.d VM0, X, 0 -#else - xvldrepl.w VM0, X, 0 -#endif - XVFSUB VM0, VM0, VM0 bne INCX, TEMP, .L20 srai.d I, N, 4 From be525521ad204c6080c10a8eca10799b67abd4e4 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:40:55 +0800 Subject: [PATCH 138/205] Loongarch64: fixed asum_lasx --- kernel/loongarch64/asum_lasx.S | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/kernel/loongarch64/asum_lasx.S b/kernel/loongarch64/asum_lasx.S index 9a2c031f3..e5ab4df3d 100644 --- a/kernel/loongarch64/asum_lasx.S +++ b/kernel/loongarch64/asum_lasx.S @@ -103,21 +103,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvfadd.d res1, VX2, res1 xvfadd.d res1, VX3, res1 #else - xvfadd.s res2, res1, res2 xvpickve.w VX1, res1, 1 xvpickve.w VX2, res1, 2 xvpickve.w VX3, res1, 3 xvfadd.s res1, VX1, res1 xvfadd.s res1, VX2, res1 xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 + xvpickve.w VX0, res1, 4 + xvpickve.w VX1, res1, 5 + xvpickve.w VX2, res1, 6 + xvpickve.w VX3, res1, 7 xvfadd.s res1, VX0, res1 xvfadd.s res1, VX1, res1 xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 #endif .align 3 @@ -217,21 +216,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfadd.d res1, VX2, res1 xvfadd.d res1, VX3, res1 #else - xvfadd.s res2, res1, res2 xvpickve.w VX1, res1, 1 xvpickve.w VX2, res1, 2 xvpickve.w VX3, res1, 3 xvfadd.s res1, VX1, res1 xvfadd.s res1, VX2, res1 xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 + xvpickve.w VX0, res1, 4 + xvpickve.w VX1, res1, 5 + xvpickve.w VX2, res1, 6 + xvpickve.w VX3, res1, 7 xvfadd.s res1, VX0, res1 xvfadd.s res1, VX1, res1 xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 #endif .align 3 From 74c97ef814c85134b3f6556f07cfb37b20ac93d5 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:41:05 +0800 Subject: [PATCH 139/205] Loongarch64: fixed cdot_lasx --- kernel/loongarch64/cdot_lasx.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/loongarch64/cdot_lasx.S b/kernel/loongarch64/cdot_lasx.S index 0583e56ea..32b8bf982 100644 --- a/kernel/loongarch64/cdot_lasx.S +++ b/kernel/loongarch64/cdot_lasx.S @@ -288,7 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w x2, t2, 6 xvinsgr2vr.w x1, t3, 7 xvinsgr2vr.w x2, t4, 7 - addi.d Y, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE xvpickev.w x3, VX3, VX2 xvpickod.w x4, VX3, VX2 xvfmadd.s res1, x1, x3, res1 From d49319c2d2667bbc604dd78e0ec5831399c6b8f7 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:41:18 +0800 Subject: [PATCH 140/205] Loongarch64: fixed cnrm2_lasx --- kernel/loongarch64/cnrm2_lasx.S | 78 ++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 25 deletions(-) diff --git a/kernel/loongarch64/cnrm2_lasx.S b/kernel/loongarch64/cnrm2_lasx.S index 3a60069ac..034f9de79 100644 --- a/kernel/loongarch64/cnrm2_lasx.S +++ b/kernel/loongarch64/cnrm2_lasx.S @@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VX4 $xr21 #define res1 $xr19 #define res2 $xr20 +#define RCP $f2 +#define VALPHA $xr3 PROLOGUE @@ -55,10 +57,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 bge $r0, N, .L999 beq $r0, INCX, .L999 + + addi.d $sp, $sp, -32 + st.d $ra, $sp, 0 + st.d N, $sp, 8 + st.d X, $sp, 16 + st.d INCX, $sp, 24 +#ifdef DYNAMIC_ARCH + bl camax_k_LA264 +#else + bl camax_k +#endif + ld.d $ra, $sp, 0 + ld.d N, $sp, 8 + ld.d X, $sp, 16 + ld.d INCX, $sp, 24 + addi.d $sp, $sp, 32 + + frecip.s RCP, $f0 + vreplvei.w $vr3, $vr2, 0 + xvpermi.d VALPHA, $xr3,0x00 + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + fcmp.ceq.s $fcc0, $f0, $f19 + bcnez $fcc0, .L999 + li.d TEMP, SIZE slli.d INCX, INCX, ZBASE_SHIFT srai.d I, N, 2 @@ -67,13 +92,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.align 3 .L10: - xvld VX0, X, 0 * SIZE - xvfcvtl.d.s VX1, VX0 - xvfcvth.d.s VX2, VX0 - xvfmadd.d res1, VX1, VX1, res1 - xvfmadd.d res2, VX2, VX2, res2 addi.d I, I, -1 - addi.d X, X, 8 * SIZE + + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfmul.s VX0, VX0, VALPHA + xvfmul.s VX1, VX1, VALPHA + xvfmadd.s res1, VX0, VX0, res1 + xvfmadd.s res2, VX1, VX1, res2 + + addi.d X, X, 16 * SIZE blt $r0, I, .L10 .align 3 b .L996 @@ -103,22 +131,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 add.d X, X, INCX - xvfcvtl.d.s VX1, VX0 - xvfcvth.d.s VX2, VX0 - xvfmadd.d res1, VX1, VX1, res1 - xvfmadd.d res2, VX2, VX2, res2 + xvfmul.s VX0, VX0, VALPHA + xvfmadd.s res2, VX0, VX0, res2 addi.d I, I, -1 blt $r0, I, .L21 b .L996 .L996: - xvfadd.d res1, res1, res2 - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 + xvfadd.s res1, res1, res2 + xvpermi.d VX1, res1, 0x4e + xvfadd.s res1, res1, VX1 + vreplvei.w $vr17, $vr19, 1 + vreplvei.w $vr18, $vr19, 2 + vreplvei.w $vr21, $vr19, 3 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvfadd.s res1, VX4, res1 .align 3 .L997: @@ -130,18 +158,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fld.s a1, X, 0 * SIZE fld.s a2, X, 1 * SIZE addi.d I, I, -1 - fcvt.d.s a1, a1 - fcvt.d.s a2, a2 - fmadd.d res, a1, a1, res - fmadd.d res, a2, a2, res + fmul.s a1, a1, RCP + fmul.s a2, a2, RCP + fmadd.s res, a1, a1, res + fmadd.s res, a2, a2, res add.d X, X, INCX blt $r0, I, .L998 .align 3 .L999: - fsqrt.d res, res + fsqrt.s res, res + fmul.s $f0, res, $f0 move $r4, $r17 - fcvt.s.d $f0, res jirl $r0, $r1, 0x0 EPILOGUE From a98dd6d91156862d16fa0c43d718e3dfaf70e279 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:41:28 +0800 Subject: [PATCH 141/205] Loongarch64: fixed copy_lasx --- kernel/loongarch64/copy_lasx.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/loongarch64/copy_lasx.S b/kernel/loongarch64/copy_lasx.S index 31f91cec1..6a723fb1e 100644 --- a/kernel/loongarch64/copy_lasx.S +++ b/kernel/loongarch64/copy_lasx.S @@ -260,9 +260,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d Y, Y, INCY ST a2, Y, 0 add.d Y, Y, INCY - ST a3, X, 0 + ST a3, Y, 0 add.d Y, Y, INCY - ST a4, X, 0 + ST a4, Y, 0 add.d Y, Y, INCY LD a1, X, 0 add.d X, X, INCX @@ -276,9 +276,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d Y, Y, INCY ST a2, Y, 0 add.d Y, Y, INCY - ST a3, X, 0 + ST a3, Y, 0 add.d Y, Y, INCY - ST a4, X, 0 + ST a4, Y, 0 add.d Y, Y, INCY addi.d I, I, -1 blt $r0, I, .L222 From dc5fa29851ea3c7c1e27195b21e3d593276d5475 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:41:39 +0800 Subject: [PATCH 142/205] Loongarch64: fixed cscal_lasx --- kernel/loongarch64/cscal_lasx.S | 247 ++++++++------------------------ 1 file changed, 61 insertions(+), 186 deletions(-) diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S index f53526663..e32071def 100644 --- a/kernel/loongarch64/cscal_lasx.S +++ b/kernel/loongarch64/cscal_lasx.S @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHAI $f1 #define X $r7 #define INCX $r8 +#define DUMMY2 $r9 #define I $r12 #define TEMP $r13 @@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
bge $r0, N, .L999 bge $r0, INCX, .L999 + ld.d DUMMY2, $sp, 0 li.d TEMP, 1 movgr2fr.d a1, $r0 FFINT a1, a1 @@ -86,24 +88,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif bne INCX, TEMP, .L22 +/////// INCX == 1 //////// .L11: - bge $r0, I, .L997 CMPEQ $fcc0, ALPHAR, a1 CMPEQ $fcc1, ALPHAI, a1 - bceqz $fcc0, .L13 - b .L14 - .align 3 + bge $r0, I, .L19 +/////// INCX == 1 && N >= 4 //////// + bnez DUMMY2, .L17 // if DUMMPY2 == 1, called from c/zscal. -.L13: - bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 - b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + bceqz $fcc0, .L17 -.L14: - bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 - b .L111 //alpha_r == 0.0 && alpha_i == 0.0 - .align 3 + bceqz $fcc1, .L17 -.L111: //alpha_r == 0.0 && alpha_i == 0.0 +.L15: //alpha_r == 0.0 && alpha_i == 0.0 xvst VXZ, X, 0 * SIZE #ifdef DOUBLE xvst VXZ, X, 4 * SIZE @@ -113,41 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d X, X, 16 * SIZE #endif addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L113: //alpha_r != 0.0 && alpha_i == 0.0 - xvld VX0, X, 0 * SIZE -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvpickev.d x1, VX1, VX0 - xvpickod.d x2, VX1, VX0 - xvfmul.d x3, VXAR, x1 - xvfmul.d x4, VXAR, x2 - xvilvl.d VX2, x4 ,x3 - xvilvh.d VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE -#else - xvld VX1, X, 8 * SIZE - xvpickev.w x1, VX1, VX0 - xvpickod.w x2, VX1, VX0 - xvfmul.s x3, VXAR, x1 - xvfmul.s x4, VXAR, x2 - xvilvl.w VX2, x4 ,x3 - xvilvh.w VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 8 * SIZE - addi.d X, X, 16 * SIZE -#endif - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 + blt $r0, I, .L15 + b .L19 .align 3 -.L114: //alpha_r != 0.0 && alpha_i != 0.0 +.L17: xvld VX0, X, 0 * SIZE #ifdef DOUBLE xvld VX1, X, 4 * SIZE @@ -177,29 +144,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d X, X, 16 * SIZE #endif addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 + blt $r0, I, .L17 + b .L19 .align 3 +/////// INCX == 1 && N < 8 /////// +.L19: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + beqz I, .L999 + bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. + + bceqz $fcc0, .L998 + + bceqz $fcc1, .L998 + + b .L995 // alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +/////// INCX != 1 //////// .L22: - bge $r0, I, .L997 - move XX, X CMPEQ $fcc0, ALPHAR, a1 CMPEQ $fcc1, ALPHAI, a1 - bceqz $fcc0, .L23 - b .L24 - .align 3 - -.L23: - bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 - b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + move XX, X + bge $r0, I, .L29 + bnez DUMMY2, .L25 // if DUMMPY2 == 1, called from c/zscal. + bceqz $fcc0, .L25 -.L24: - bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 - b .L221 //alpha_r == 0.0 && alpha_i == 0.0 - .align 3 + bceqz $fcc1, .L25 -.L221: //alpha_r == 0.0 && alpha_i == 0.0 +.L27: //alpha_r == 0.0 && alpha_i == 0.0 #ifdef DOUBLE xvstelm.d VXZ, X, 0, 0 xvstelm.d VXZ, X, 1 * SIZE, 0 @@ -239,122 +216,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif add.d X, X, INCX addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 - .align 3 - -.L223: //alpha_r != 0.0 && alpha_i == 0.0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.d x1, t1, 0 - xvinsgr2vr.d x2, t2, 0 - xvinsgr2vr.d x1, t3, 1 - xvinsgr2vr.d x2, t4, 1 - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - xvinsgr2vr.d x1, t1, 2 - xvinsgr2vr.d x2, t2, 2 - xvinsgr2vr.d x1, t3, 3 - xvinsgr2vr.d x2, t4, 3 - add.d X, X, INCX - - xvfmul.d x3, VXAR, x1 - xvfmul.d x4, VXAR, x2 - addi.d I, I, -1 - xvstelm.d x3, XX, 0 * SIZE, 0 - xvstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 1 - xvstelm.d x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 2 - xvstelm.d x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 3 - xvstelm.d x4, XX, 1 * SIZE, 3 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 0 - xvinsgr2vr.w x2, t2, 0 - xvinsgr2vr.w x1, t3, 1 - xvinsgr2vr.w x2, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 2 - xvinsgr2vr.w x2, t2, 2 - xvinsgr2vr.w x1, t3, 3 - xvinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 4 - xvinsgr2vr.w x2, t2, 4 - xvinsgr2vr.w x1, t3, 5 - xvinsgr2vr.w x2, t4, 5 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 6 - xvinsgr2vr.w x2, t2, 6 - xvinsgr2vr.w x1, t3, 7 - xvinsgr2vr.w x2, t4, 7 - add.d X, X, INCX - - xvfmul.s x3, VXAR, x1 - xvfmul.s x4, VXAR, x2 - addi.d I, I, -1 - xvstelm.w x3, XX, 0 * SIZE, 0 - xvstelm.w x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 1 - xvstelm.w x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 2 - xvstelm.w x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 3 - xvstelm.w x4, XX, 1 * SIZE, 3 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 4 - xvstelm.w x4, XX, 1 * SIZE, 4 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 5 - xvstelm.w x4, XX, 1 * SIZE, 5 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 6 - xvstelm.w x4, XX, 1 * SIZE, 6 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 7 - xvstelm.w x4, XX, 1 * SIZE, 7 -#endif - add.d XX, XX, INCX - blt $r0, I, .L223 - b .L997 + blt $r0, I, .L27 + b .L29 .align 3 -.L224: //alpha_r != 0.0 && alpha_i != 0.0 +.L25: #ifdef DOUBLE ld.d t1, X, 0 * SIZE ld.d t2, X, 1 * SIZE @@ -376,7 +242,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.d x1, t3, 3 xvinsgr2vr.d x2, t4, 3 add.d X, X, INCX - xvfmul.d VX0, VXAI, x2 xvfmsub.d x3, VXAR, x1, VX0 xvfmul.d VX1, VXAI, x1 @@ -434,7 +299,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w x1, t3, 7 xvinsgr2vr.w x2, t4, 7 add.d X, X, INCX - xvfmul.s VX0, VXAI, x2 xvfmsub.s x3, VXAR, x1, VX0 xvfmul.s VX1, VXAI, x1 @@ -465,19 +329,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvstelm.w x4, XX, 1 * SIZE, 7 #endif add.d XX, XX, INCX - blt $r0, I, .L224 - b .L997 + blt $r0, I, .L25 + b .L29 .align 3 -.L997: +/////// INCX != 1 && N < 8 /////// +.L29: #ifdef DOUBLE - andi I, N, 3 + andi I, N, 3 #else - andi I, N, 7 + andi I, N, 7 #endif - bge $r0, I, .L999 - .align 3 + beqz I, .L999 + bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. + + bceqz $fcc0, .L998 + bceqz $fcc1, .L998 + +.L995: // alpha_r == 0.0 && alpha_i == 0.0 + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L995 + b .L999 .L998: LD a1, X, 0 * SIZE LD a2, X, 1 * SIZE @@ -490,11 +366,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ST s2, X, 1 * SIZE add.d X, X, INCX blt $r0, I, .L998 - .align 3 + b .L999 .L999: move $r4, $r12 jirl $r0, $r1, 0x0 .align 3 - EPILOGUE From ba9569e382639ab96ddb896df13007f0e834d5ea Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:41:48 +0800 Subject: [PATCH 143/205] Loongarch64: fixed dot_lasx --- kernel/loongarch64/dot_lasx.S | 86 ++++++++++++----------------------- 1 file changed, 29 insertions(+), 57 deletions(-) diff --git a/kernel/loongarch64/dot_lasx.S b/kernel/loongarch64/dot_lasx.S index 11c896cb9..e72f848f8 100644 --- a/kernel/loongarch64/dot_lasx.S +++ b/kernel/loongarch64/dot_lasx.S @@ -53,8 +53,8 @@ PROLOGUE #endif /* init $f8 and $f9 to zero */ - SUB s1, s1, s1 - SUB s2, s2, s2 + xvxor.v $xr8, $xr8, $xr8 + xvxor.v $xr9, $xr9, $xr9 slli.d INCX, INCX, BASE_SHIFT li.d TEMP, SIZE slli.d INCY, INCY, BASE_SHIFT @@ -64,20 +64,6 @@ PROLOGUE /* !((inc_x == 1) && (inc_y == 1)) */ - /* init $xr8 and $xr9 to zero */ -#ifdef DOUBLE - xvldrepl.d $xr0, X, 0 -#else - xvldrepl.w $xr0, X, 0 -#endif -#ifdef DSDOT - xvfcvtl.d.s $xr0, $xr0 - xvfsub.d $xr8, $xr0, $xr0 - xvfsub.d $xr9, $xr0, $xr0 -#else - XVFSUB $xr8, $xr0, $xr0 - XVFSUB $xr9, $xr0, $xr0 -#endif #ifdef DOUBLE srai.d I, N, 4 @@ -99,31 +85,31 @@ PROLOGUE addi.w I, I, -1 addi.d X, X, 128 addi.d Y, Y, 128 -#ifdef DSDOT +#ifndef DOUBLE xvfcvtl.d.s $xr10, $xr0 xvfcvtl.d.s $xr11, $xr4 xvfcvth.d.s $xr12, $xr0 xvfcvth.d.s $xr13, $xr4 - xvfmadd.d $xr8, $xr10, $xr12, $xr8 - xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfmadd.d $xr8, $xr10, $xr11, $xr8 + xvfmadd.d $xr9, $xr12, $xr13, $xr9 xvfcvtl.d.s $xr10, $xr1 xvfcvtl.d.s $xr11, $xr5 xvfcvth.d.s $xr12, $xr1 xvfcvth.d.s $xr13, $xr5 - xvfmadd.d $xr8, $xr10, $xr12, $xr8 - xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfmadd.d $xr8, $xr10, $xr11, $xr8 + xvfmadd.d $xr9, $xr12, $xr13, $xr9 xvfcvtl.d.s $xr10, $xr2 xvfcvtl.d.s $xr11, $xr6 xvfcvth.d.s $xr12, $xr2 xvfcvth.d.s $xr13, $xr6 - xvfmadd.d $xr8, $xr10, $xr12, $xr8 - xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfmadd.d $xr8, $xr10, $xr11, $xr8 + xvfmadd.d $xr9, $xr12, $xr13, $xr9 xvfcvtl.d.s $xr10, $xr3 xvfcvtl.d.s $xr11, $xr7 xvfcvth.d.s $xr12, $xr3 xvfcvth.d.s $xr13, $xr7 - xvfmadd.d $xr8, $xr10, $xr12, $xr8 - xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfmadd.d $xr8, $xr10, $xr11, $xr8 + xvfmadd.d $xr9, $xr12, $xr13, $xr9 #else XVFMADD $xr8, $xr0, $xr4, $xr8 XVFMADD $xr9, $xr1, $xr5, $xr9 @@ -149,13 +135,13 @@ PROLOGUE addi.w I, I, -1 addi.d X, X, 32 addi.d Y, Y, 32 -#ifdef DSDOT +#ifndef DOUBLE xvfcvtl.d.s $xr10, $xr0 xvfcvtl.d.s $xr11, $xr4 xvfcvth.d.s $xr12, $xr0 xvfcvth.d.s $xr13, $xr4 - xvfmadd.d $xr8, $xr10, $xr12, $xr8 - xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfmadd.d $xr8, $xr10, $xr11, $xr8 + xvfmadd.d $xr9, $xr12, $xr13, $xr9 #else XVFMADD $xr8, $xr0, $xr4, $xr8 #endif @@ -163,27 +149,12 @@ PROLOGUE .align 3 .L14: /* store dot in s1 $f8 */ -#ifdef 
DSDOT xvfadd.d $xr8, $xr8, $xr9 - fsub.s s2, s2, s2 /* set s2 to 0.0 */ + fsub.d s2, s2, s2 /* set s2 to 0.0 */ xvpermi.q $xr0, $xr8, 0x1 vfadd.d $vr8, $vr8, $vr0 vpackod.d $vr0, $vr8, $vr8 vfadd.d $vr8, $vr8, $vr0 -#else - XVFADD $xr8, $xr8, $xr9 - SUB s2, s2, s2 /* set s2 to 0.0 */ - xvpermi.q $xr0, $xr8, 0x1 - VFADD $vr8, $vr8, $vr0 - vpackod.d $vr0, $vr8, $vr8 -#ifdef DOUBLE - VFADD $vr8, $vr8, $vr0 -#else - VFADD $vr8, $vr8, $vr0 - vpackod.w $vr0, $vr8, $vr8 - VFADD $vr8, $vr8, $vr0 -#endif /* defined DOUBLE */ -#endif /* defined DSDOT */ .align 3 .L15: #ifdef DOUBLE @@ -197,7 +168,7 @@ PROLOGUE /* FLOAT: 1~7 ; DOUBLE: 1~3 */ LD a1, X, 0 LD b1, Y, 0 -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -240,7 +211,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -252,7 +223,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -264,7 +235,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -276,7 +247,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -288,7 +259,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -300,7 +271,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -312,7 +283,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -325,7 +296,7 @@ PROLOGUE LD b1, Y, 0 * SIZE add.d Y, Y, INCY addi.d I, I, -1 -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -346,7 +317,7 @@ PROLOGUE LD b1, Y, 0 * SIZE add.d Y, Y, INCY addi.d I, I, -1 -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -357,12 +328,13 @@ PROLOGUE .align 3 .L999: -#ifdef DSDOT fadd.d $f0, s1, s2 + move $r4, $r17 +#if defined(DOUBLE) +#elif defined(DSDOT) #else - ADD $f0, s1, s2 + fcvt.s.d $f0, $f0 #endif - move $r4, $r17 jirl $r0, $r1, 0x0 EPILOGUE From b528b1b8ea5e4237cd6c4da4eb2df631f63b5782 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:41:58 +0800 Subject: [PATCH 144/205] Loongarch64: fixed iamax_lasx --- kernel/loongarch64/iamax_lasx.S | 566 ++++++++++++++++---------------- 1 file changed, 282 insertions(+), 284 deletions(-) diff --git a/kernel/loongarch64/iamax_lasx.S b/kernel/loongarch64/iamax_lasx.S index 090da3004..a573fd4b7 100644 --- a/kernel/loongarch64/iamax_lasx.S +++ b/kernel/loongarch64/iamax_lasx.S @@ -56,25 +56,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VI3 $xr8 #define VI4 $xr19 #define VT0 $xr23 +#define VZE $xr3 +#define VT1 $xr4 +#define VT2 $xr5 +#define VC0 $xr6 PROLOGUE li.d i0, 0 bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 + xvldi VZE, 0 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 #ifdef DOUBLE + xvfsub.d VT1, VZE, VM0 addi.d i0, i0, 1 srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 + xvfmaxa.d VM0, VM0, VT1 + bge $r0, I, .L11 + slli.d i0, i0, 1 //2 xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 + slli.d i0, i0, 1 //4 xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 + addi.d i0, i0, -7 xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 1 @@ -82,19 +89,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.d VI1, i0, 2 addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 + xvinsgr2vr.d VI0, i0, 0 //initialize the index value for vectorization addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 + xvinsgr2vr.d VI0, i0, 1 addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 + xvinsgr2vr.d VI0, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 #else + xvfsub.s VT1, VZE, VM0 addi.w i0, i0, 1 srai.d I, N, 3 + xvfmaxa.s VM0, VM0, VT1 bge $r0, I, .L21 - slli.w i0, i0, 3 //8 + slli.w i0, i0, 2 //4 + xvreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 xvreplgr2vr.w VINC8, i0 addi.w i0, i0, -15 xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization @@ -135,73 +146,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef DOUBLE xvld VX0, X, 0 * SIZE xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE + xvld VX1, X, 2 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfsub.d VT1, VZE, VX0 + xvfsub.d VT2, VZE, VX1 + xvfmaxa.d VX0, VX0, VT1 + xvfmaxa.d VX1, VX1, VT2 + xvfcmp.clt.d VT0, VX0, VX1 //abx(x0) < abs(x1) + xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf) + xvbitsel.v x2, VI1, VI2, VT0 //i + + xvld VX0, X, 4 * SIZE + xvadd.d VI1, VI2, VINC4 + xvld VX1, X, 6 * SIZE xvadd.d VI2, VI1, VINC4 - xvfmaxa.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 + xvfsub.d VT1, VZE, VX0 + xvfsub.d VT2, VZE, VX1 + xvfmaxa.d VX0, VX0, VT1 + xvfmaxa.d VX1, VX1, VT2 + xvfcmp.clt.d VT0, VX0, VX1 + xvbitsel.v x3, VX0, VX1, VT0 //abs(maxf) + xvbitsel.v x4, VI1, VI2, VT0 //i + xvfcmp.clt.d VC0, x1, x3 + xvbitsel.v x1, x1, x3, VC0 //abs(maxf) + xvbitsel.v x2, x2, x4, VC0 //i + xvfcmp.clt.d VT0, VM0, x1 addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmaxa.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VI0, x2, VT0 #else xvld VX0, X, 0 * SIZE - addi.d I, I, -1 xvadd.w VI1, VI1, VINC8 - xvfmaxa.s VM1, VX0, VM0 - xvfcmp.ceq.s VT0, VM0, VM1 + xvld VX1, X, 4 * SIZE + xvadd.w VI2, VI1, VINC4 + xvfsub.s VT1, VZE, VX0 + xvfsub.s VT2, VZE, VX1 + xvfmaxa.s VX0, VX0, VT1 + xvfmaxa.s VX1, VX1, VT2 + xvfcmp.clt.s VT0, VX0, VX1 + xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf) + xvbitsel.v x2, VI1, VI2, VT0 //i + addi.d I, I, -1 + xvfcmp.clt.s VT0, VM0, x1 addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VI0, x2, VT0 + #endif blt $r0, I, .L10 .align 3 .L15: #ifdef DOUBLE - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - 
xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 + vreplvei.d $vr21, $vr20, 0 + vreplvei.d $vr22, $vr20, 1 + vreplvei.d $vr9, $vr15, 0 + vreplvei.d $vr10, $vr15, 1 + fcmp.ceq.d $fcc0, $f9, $f10 + bceqz $fcc0, .L16 + xvfcmp.clt.d VT0, VI1, VI2 + xvbitsel.v VI0, VI2, VI1, VT0 + b .L17 #else - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 + vreplvei.w $vr21, $vr20, 0 + vreplvei.w $vr22, $vr20, 1 + vreplvei.w $vr8, $vr20, 2 + vreplvei.w $vr19, $vr20, 3 + vreplvei.w $vr9, $vr15, 0 + vreplvei.w $vr10, $vr15, 1 + vreplvei.w $vr11, $vr15, 2 + vreplvei.w $vr12, $vr15, 3 + b .L26 #endif - XVFMAXA VM1, x1, x2 - XVCMPEQ VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - XVFMAXA VM0, x3, x4 - XVCMPEQ VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - XVFMAXA VM0, VM0, VM1 - XVCMPEQ VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - CMPEQ $fcc0, $f15, $f9 - bceqz $fcc0, .L26 - XVCMPLT VT0, VI1, VI0 + .align 3 + +#ifdef DOUBLE +.L16: + xvfcmp.clt.d VT0, x1, x2 + xvbitsel.v VI0, VI1, VI2, VT0 + xvbitsel.v VM0, x1, x2, VT0 + .align 3 + +.L17: + movfr2gr.d i0, $f20 + .align 3 + +.L11: //INCX==1 and N<8 + andi I, N, 7 + bge $r0, I, .L14 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L13: + fld.d $f9, X, 0 + fsub.d $f10, $f3, $f9 + xvfmaxa.d x1, x1, x2 + xvfcmp.clt.d VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 + addi.d I, I, -1 + addi.d i1, i1, 1 + addi.d X, X, SIZE + movgr2fr.d $f21, i1 + blt $r0, I, .L13 + movfr2gr.d i0, $f20 + .align 3 + +.L14: + move $r4, $r17 + jirl $r0, $r1, 0x0 .align 3 .L20: // INCX!=1 move TEMP, X -#ifdef DOUBLE addi.d i0, i0, 1 ld.d t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -210,34 +272,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
bge $r0, I, .L21 ld.d t2, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 + slli.d i0, i0, 1 //2 + xvfsub.d VT1, VZE, VM0 xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 + slli.d i0, i0, 1 //4 xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 + addi.d i0, i0, -7 + xvfmaxa.d VM0, VM0, VT1 xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 + addi.d i0, i0, 3 xvinsgr2vr.d VI0, i0, 0 //1 addi.d i0, i0, 1 xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t2, 1 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t2, 1 + xvadd.d VI2, VI1, VINC4 + + xvfsub.d VT1, VZE, VX0 + xvfsub.d VT2, VZE, VX1 + xvfmaxa.d VX0, VX0, VT1 + xvfmaxa.d VX1, VX1, VT2 + xvfcmp.clt.d VT0, VX0, VX1 + xvbitsel.v x1, VX0, VX1, VT0 + xvbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t2, 1 + xvadd.d VI1, VI2, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t2, 1 + xvadd.d VI2, VI1, VINC4 + xvfsub.d VT1, VZE, VX0 + xvfsub.d VT2, VZE, VX1 + xvfmaxa.d VX0, VX0, VT1 + xvfmaxa.d VX1, VX1, VT2 + xvfcmp.clt.d VT0, VX0, VX1 + xvbitsel.v x3, VX0, VX1, VT0 + xvbitsel.v x4, VI1, VI2, VT0 + xvfcmp.clt.d VC0, x1, x3 + xvbitsel.v x1, x1, x3, VC0 + xvbitsel.v x2, x2, x4, VC0 + xvfcmp.clt.d VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VI0, x2, VT0 + + addi.d I, I, -1 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d $vr21, $vr20, 0 + vreplvei.d $vr22, $vr20, 1 + vreplvei.d $vr9, $vr15, 0 + vreplvei.d $vr10, $vr15, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI2 + xvbitsel.v VI0, VI2, VI1, VT0 + b .L27 + .align 3 + +.L26: + xvfcmp.clt.d VT0, x1, x2 + xvbitsel.v VI0, VI1, VI2, VT0 + xvbitsel.v VM0, x1, x2, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + #else +.L20: // INCX!=1 + move TEMP, X addi.w i0, i0, 1 ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -253,19 +384,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w VM0, t2, 1 xvinsgr2vr.w VM0, t3, 2 xvinsgr2vr.w VM0, t4, 3 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.w VM0, t1, 4 - xvinsgr2vr.w VM0, t2, 5 - xvinsgr2vr.w VM0, t3, 6 - xvinsgr2vr.w VM0, t4, 7 - slli.w i0, i0, 3 //8 + slli.w i0, i0, 2 //4 + xvreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 xvreplgr2vr.w VINC8, i0 addi.w i0, i0, -15 xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization @@ -275,15 +396,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvinsgr2vr.w VI1, i0, 2 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 1 - xvinsgr2vr.w VI1, i0, 4 - addi.w i0, i0, 1 - xvinsgr2vr.w VI1, i0, 5 - addi.w i0, i0, 1 - xvinsgr2vr.w VI1, i0, 6 - addi.w i0, i0, 1 - xvinsgr2vr.w VI1, i0, 7 - addi.w i0, i0, 1 + addi.w i0, i0, 5 xvinsgr2vr.w VI0, i0, 0 //1 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 1 //2 @@ -291,54 +404,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w VI0, i0, 2 //3 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 3 //4 - addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 4 //5 - addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 5 //6 - addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 6 //7 - addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 7 //8 -#endif .align 3 .L24: -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfmaxa.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmaxa.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 -#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -351,6 +419,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w VX0, t2, 1 xvinsgr2vr.w VX0, t3, 2 xvinsgr2vr.w VX0, t4, 3 + xvadd.w VI1, VI1, VINC8 ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -359,158 +428,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add.d X, X, INCX ld.w t4, X, 0 * SIZE add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvadd.w VI1, VI1, VINC8 - xvfmaxa.s VM1, VX0, VM0 - xvfcmp.ceq.s VT0, VM1, VM0 + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + xvadd.w VI2, VI1, VINC4 + xvfsub.s VT1, VZE, VX0 + xvfsub.s VT2, VZE, VX1 + xvfmaxa.s VX0, VX0, VT1 + xvfmaxa.s VX1, VX1, VT2 + xvfcmp.clt.s VT0, VX0, VX1 + xvbitsel.v x1, VX0, VX1, VT0 + xvbitsel.v x2, VI1, VI2, VT0 //i + addi.d I, I, -1 - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 -#endif + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VI0, x2, VT0 blt $r0, I, .L24 .align 3 .L25: -#ifdef DOUBLE - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 -#else - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfmaxa.s VM1, x1, x2 - xvfcmp.ceq.s VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.s VM0, x3, x4 - xvfcmp.ceq.s VT0, x3, VM0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfmaxa.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 -#endif - CMPEQ $fcc0, $f15, $f9 - bceqz $fcc0, .L26 - XVCMPLT VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 + vreplvei.w $vr21, $vr20, 0 + vreplvei.w $vr22, $vr20, 1 + vreplvei.w $vr8, $vr20, 2 + vreplvei.w $vr19, $vr20, 3 + vreplvei.w $vr9, $vr15, 0 + vreplvei.w $vr10, $vr15, 1 + vreplvei.w $vr11, $vr15, 2 + vreplvei.w $vr12, $vr15, 3 .align 3 .L26: - fcmp.ceq.d $fcc0, $f15, $f10 - bceqz $fcc0, .L27 - XVCMPLT VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 + fcmp.ceq.s $fcc0, $f9, $f10 + bceqz $fcc0, .L31 + xvfcmp.clt.s VT0, VI1, VI2 + xvbitsel.v VI1, VI2, VI1, VT0 + b .L32 .align 3 - -.L27: - fcmp.ceq.d $fcc0, $f15, $f11 - bceqz $fcc0, .L28 - XVCMPLT VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 +.L31: + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VI1, VI1, VI2, VT0 + xvbitsel.v x1, x1, x2, VT0 .align 3 - -.L28: - fcmp.ceq.d $fcc0, $f15, $f12 - bceqz $fcc0, .L29 - XVCMPLT VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 +.L32: + fcmp.ceq.s $fcc0, $f11, $f12 + bceqz $fcc0, .L33 + xvfcmp.clt.s VT1, VI3, VI4 + xvbitsel.v VI3, VI4, VI3, VT1 + b .L34 .align 3 - -.L29: -#ifdef DOUBLE - movfr2gr.d i0, $f20 -#else - fmov.s $f16, $f20 -#endif +.L33: + xvfcmp.clt.s VT1, x3, x4 + xvbitsel.v x3, x3, x4, VT1 + xvbitsel.v VI3, VI3, VI4, VT1 .align 3 - -#ifdef DOUBLE - -#else -.L252: - xvxor.v VI0, VI0, VI0 - xvor.v VI0, VI0, VX0 - fmov.s $f13, $f15 - xvxor.v VM0, VM0, VM0 - xvor.v VM0, VM0, VX1 - xvpickve.w VI1, VI0, 4 - xvpickve.w VI2, VI0, 5 - xvpickve.w VI3, VI0, 6 - xvpickve.w VI4, VI0, 7 - xvpickve.w x1, VM0, 4 - xvpickve.w x2, VM0, 5 - xvpickve.w x3, VM0, 6 - xvpickve.w x4, VM0, 7 - xvfmaxa.s VM1, x1, x2 - xvfcmp.ceq.s VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.s VM0, x3, x4 - 
xvfcmp.ceq.s VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - fcmp.ceq.d $fcc0, $f15, $f9 - bceqz $fcc0, .L262 - xvfcmp.clt.s VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 +.L34: + fcmp.ceq.s $fcc0, $f9, $f11 + bceqz $fcc0, .L35 + xvfcmp.clt.s VT0, VI1, VI3 + xvbitsel.v VI0, VI3, VI1, VT0 + xvxor.v VM0, x1, VZE + b .L29 .align 3 - -.L262: - fcmp.ceq.d $fcc0, $f15, $f10 - bceqz $fcc0, .L272 - xvfcmp.clt.s VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 +.L35: + xvfcmp.clt.s VT0, x1, x3 + xvbitsel.v VM0, x1, x3, VT0 + xvbitsel.v VI0, VI1, VI3, VT0 .align 3 -.L272: - fcmp.ceq.d $fcc0, $f15, $f11 - bceqz $fcc0, .L282 - xvfcmp.clt.s VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L282: - fcmp.ceq.d $fcc0, $f15, $f12 - bceqz $fcc0, .L292 - xvfcmp.clt.s VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 +.L29: + movfr2gr.s i0, $f20 .align 3 -.L292: - xvfmaxa.s VM0, VX0, VM0 - xvfcmp.ceq.s VT0, VM0, VX0 - xvbitsel.v VI0, VI0, VI1, VT0 - movfr2gr.s i0, $f20 #endif - -.L21: //N<8 +.L21: // N<8 andi I, N, 7 bge $r0, I, .L999 srai.d i1, N, 3 @@ -521,17 +512,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L22: - LD $f9, X, 0 + LD $f9, X, 0 +#ifdef DOUBLE + fsub.d $f10, $f3, $f9 + xvfmaxa.d x1, x1, x2 + xvfcmp.clt.d VT0, VM0, x1 +#else + fsub.s $f10, $f3, $f9 + xvfmaxa.s x1, x1, x2 + xvfcmp.clt.s VT0, VM0, x1 +#endif + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 addi.d I, I, -1 - XVFMAXA VM1, x1, VM0 - XVCMPEQ VT0, VM0, VM1 - add.d X, X, INCX - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 addi.d i1, i1, 1 + add.d X, X, INCX movgr2fr.d $f21, i1 blt $r0, I, .L22 - MTG i0, $f20 + MTG i0, $f20 .align 3 .L999: From 6dc4ca23918a12628c44cad4aacd934d2b8417df Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:42:12 +0800 Subject: [PATCH 145/205] Loongarch64: fixed icamax_lasx --- kernel/loongarch64/icamax_lasx.S | 412 +++++++++++++------------------ 1 file changed, 165 insertions(+), 247 deletions(-) diff --git a/kernel/loongarch64/icamax_lasx.S b/kernel/loongarch64/icamax_lasx.S index 7800cb917..fb47aa458 100644 --- a/kernel/loongarch64/icamax_lasx.S +++ b/kernel/loongarch64/icamax_lasx.S @@ -76,66 +76,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi.d i0, i0, 1 srai.d I, N, 2 bge $r0, I, .L21 - slli.d i0, i0, 2 //4 + slli.d i0, i0, 1 //2 xvreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 + addi.d i0, i0, -3 xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 2 + addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, -1 + addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 2 - xvinsgr2vr.d VI1, i0, 3 addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 2 - xvinsgr2vr.d VI0, i0, 1 //3 + xvinsgr2vr.d VI1, i0, 3 addi.d i0, i0, -1 - xvinsgr2vr.d VI0, i0, 2 //2 - addi.d i0, i0, 2 - xvinsgr2vr.d VI0, i0, 3 //4 + xvinsgr2vr.d VI0, i0, 0 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 #else li.w I, -1 xvreplgr2vr.w VI4, I xvffint.s.w VI4, VI4 // -1 bne INCX, TEMP, .L20 addi.w i0, i0, 1 - srai.d I, N, 3 + srai.d I, N, 2 bge $r0, I, .L21 - slli.w i0, i0, 3 //8 - xvreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 + slli.w i0, i0, 2 //4 + xvreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 2 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, -3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 4 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 5 - addi.w i0, i0, 3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 6 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 0 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 0 //1 + xvinsgr2vr.w VI0, i0, 1 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 3 - xvinsgr2vr.w VI0, i0, 2 //5 + xvinsgr2vr.w VI0, i0, 2 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 3 //6 - addi.w i0, i0, -3 - xvinsgr2vr.w VI0, i0, 4 //3 + xvinsgr2vr.w VI0, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 5 //4 - addi.w i0, i0, 3 - xvinsgr2vr.w VI0, i0, 6 //7 + xvinsgr2vr.w VI0, i0, 5 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 7 //8 + xvinsgr2vr.w VI0, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 #endif .align 3 @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvld VX0, X, 0 * SIZE #ifdef DOUBLE xvadd.d VI1, VI1, VINC4 - xvld VX1, X, 4 * SIZE + xvld VX1, X, 2 * SIZE addi.d I, I, -1 xvpickev.d x1, VX1, VX0 xvpickod.d x2, VX1, VX0 @@ -153,22 +153,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvfcmp.clt.d VINC8, x2, VI3 xvbitsel.v x1, x1, x3, VT0 xvbitsel.v x2, x2, x4, VINC8 + xvfadd.d x1, x1, x2 + xvfmax.d x3, VM0, x1 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + xvld VX0, X, 4 * SIZE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 6 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 #else - xvadd.w VI1, VI1, VINC8 - xvld VX1, X, 8 * SIZE + xvadd.w VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE addi.d I, I, -1 xvpickev.w x1, VX1, VX0 xvpickod.w x2, VX1, VX0 xvfmul.s x3, VI4, x1 xvfmul.s x4, VI4, x2 - xvfcmp.clt.s VT0, x1, VI3 - xvfcmp.clt.s VINC4, x2, VI3 - xvbitsel.v x1, x1, x3, VT0 - xvbitsel.v x2, x2, x4, VINC4 #endif - XVFADD x1, x1, x2 - XVFMAX x3, VM0, x1 - XVCMPEQ VT0, x3, VM0 + XVCMPLT VT0, x1, VI3 + XVCMPLT VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + XVFADD x1, x1, x2 + XVFMAX x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 addi.d X, X, 8 * SIZE xvbitsel.v VM0, x3, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 @@ -177,51 +189,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L15: #ifdef DOUBLE - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmax.d VM1, x1, x2 - xvfcmp.ceq.d VT0, VM1, x1 + vreplvei.d $vr21, $vr20, 0 + vreplvei.d $vr22, $vr20, 1 + vreplvei.d $vr9, $vr15, 0 + vreplvei.d $vr10, $vr15, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI2 + xvbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w $vr21, $vr20, 0 + vreplvei.w $vr22, $vr20, 1 + vreplvei.w $vr8, $vr20, 2 + vreplvei.w $vr19, $vr20, 3 + vreplvei.w $vr9, $vr15, 0 + vreplvei.w $vr10, $vr15, 1 + vreplvei.w $vr11, $vr15, 2 + vreplvei.w $vr12, $vr15, 3 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, VM1, x1 xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmax.d VM0, x3, x4 - xvfcmp.ceq.d VT0, x3, VM0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmax.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 -#else - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfcmp.clt.s VT0, x1, x2 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x3, x4 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 -#endif fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - XVCMPLT VT0, VI1, VI0 + xvfcmp.clt.s VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 +#endif .align 3 .L20: // INCX!=1 @@ -229,62 +229,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi.d i0, i0, 1 srai.d I, N, 2 bge $r0, I, .L21 - slli.d i0, i0, 2 //4 + slli.d i0, i0, 1 //2 xvreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 + addi.d i0, i0, -3 xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 2 + addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, -1 + addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 2 - xvinsgr2vr.d VI1, i0, 3 addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 2 - xvinsgr2vr.d VI0, i0, 1 //3 + xvinsgr2vr.d VI1, i0, 3 addi.d i0, i0, -1 - xvinsgr2vr.d VI0, i0, 2 //2 - addi.d i0, i0, 2 - xvinsgr2vr.d VI0, i0, 3 //4 + xvinsgr2vr.d VI0, i0, 0 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 #else addi.w i0, i0, 1 - srai.d I, N, 3 + srai.d I, N, 2 bge $r0, I, .L21 - slli.w i0, i0, 3 //8 - xvreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 + slli.w i0, i0, 2 //4 + xvreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 2 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, -3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 4 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 5 - addi.w i0, i0, 3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 6 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 0 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 0 //1 + xvinsgr2vr.w VI0, i0, 1 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 3 - xvinsgr2vr.w VI0, i0, 2 //5 + xvinsgr2vr.w VI0, i0, 2 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 3 //6 - addi.w i0, i0, -3 - xvinsgr2vr.w VI0, i0, 4 //3 + xvinsgr2vr.w VI0, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 5 //4 - addi.w i0, i0, 3 - xvinsgr2vr.w VI0, i0, 6 //7 + xvinsgr2vr.w VI0, i0, 5 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 7 //8 + xvinsgr2vr.w VI0, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 #endif .align 3 @@ -301,16 +301,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.d x1, t3, 1 xvinsgr2vr.d x2, t4, 1 xvadd.d VI1, VI1, VINC4 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + xvfadd.d x1, x1, x2 + xvfmax.d x3, VM0, x1 ld.d t1, X, 0 * SIZE + xvfcmp.ceq.d VT0, x3, VM0 ld.d t2, X, 1 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 add.d X, X, INCX ld.d t3, X, 0 * SIZE ld.d t4, X, 1 * SIZE add.d X, X, INCX - xvinsgr2vr.d x1, t1, 2 - xvinsgr2vr.d x2, t2, 2 - xvinsgr2vr.d x1, t3, 3 - xvinsgr2vr.d x2, t4, 3 + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + xvadd.d VI1, VI1, VINC4 addi.d I, I, -1 xvfmul.d x3, VI4, x1 xvfmul.d x4, VI4, x2 @@ -332,6 +344,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w x2, t2, 0 xvinsgr2vr.w x1, t3, 1 xvinsgr2vr.w x2, t4, 1 + xvadd.w VI1, VI1, VINC4 ld.w t1, X, 0 * SIZE ld.w t2, X, 1 * SIZE add.d X, X, INCX @@ -342,31 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvinsgr2vr.w x2, t2, 2 xvinsgr2vr.w x1, t3, 3 xvinsgr2vr.w x2, t4, 3 - xvadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 4 - xvinsgr2vr.w x2, t2, 4 - xvinsgr2vr.w x1, t3, 5 - xvinsgr2vr.w x2, t4, 5 - xvadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 6 - xvinsgr2vr.w x2, t2, 6 - xvinsgr2vr.w x1, t3, 7 - xvinsgr2vr.w x2, t4, 7 addi.d I, I, -1 - xvpickev.w x1, VX1, VX0 - xvpickod.w x2, VX1, VX0 xvfmul.s x3, VI4, x1 xvfmul.s x4, VI4, x2 xvfcmp.clt.s VT0, x1, VI3 @@ -384,152 +373,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L25: #ifdef DOUBLE - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfcmp.ceq.d VT0, VM1, x1 + vreplvei.d $vr21, $vr20, 0 + vreplvei.d $vr22, $vr20, 1 + vreplvei.d $vr9, $vr15, 0 + vreplvei.d $vr10, $vr15, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI2 + xvbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w $vr21, $vr20, 0 + vreplvei.w $vr22, $vr20, 1 + vreplvei.w $vr8, $vr20, 2 + vreplvei.w $vr19, $vr20, 3 + vreplvei.w $vr9, $vr15, 0 + vreplvei.w $vr10, $vr15, 1 + vreplvei.w $vr11, $vr15, 2 + vreplvei.w $vr12, $vr15, 3 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, VM1, x1 xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.d VM0, x3, x4 - xvfcmp.ceq.d VT0, x3, VM0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 -#else - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfcmp.clt.s VT0, x1, x2 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x3, x4 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 -#endif fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - XVCMPLT VT0, VI1, VI0 + xvfcmp.clt.s VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 +#endif .align 3 +#ifdef DOUBLE .L26: - fcmp.ceq.d $fcc0, $f15, $f10 - bceqz $fcc0, .L27 - XVCMPLT VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 + xvfmaxa.d VM0, x1, x2 + xvfcmp.ceq.d VT0, x1, VM0 + xvbitsel.v VI0, VI2, VI1, VT0 .align 3 .L27: - fcmp.ceq.d $fcc0, $f15, $f11 - bceqz $fcc0, .L28 - XVCMPLT VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - fcmp.ceq.d $fcc0, $f15, $f12 - bceqz $fcc0, .L29 - XVCMPLT VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: -#ifdef DOUBLE movfr2gr.d i0, $f20 -#else - fmov.s $f16, $f20 -#endif .align 3 - -#ifdef DOUBLE #else -.L252: - xvxor.v VI0, VI0, VI0 - xvor.v VI0, VI0, VX0 - fmov.s $f13, $f15 - xvxor.v VM0, VM0, VM0 - xvor.v VM0, VM0, VX1 - xvpickve.w VI1, VI0, 4 - xvpickve.w VI2, VI0, 5 - xvpickve.w VI3, VI0, 6 - xvpickve.w VI4, VI0, 7 - xvpickve.w x1, VM0, 4 - xvpickve.w x2, VM0, 5 - xvpickve.w x3, VM0, 6 - xvpickve.w x4, VM0, 7 - 
xvfcmp.clt.s VT0, x1, x2 - xvbitsel.v x1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x3, x4 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM0, x1 - xvbitsel.v VM0, VM0, x1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - fcmp.ceq.d $fcc0, $f15, $f9 - bceqz $fcc0, .L262 - xvfcmp.clt.s VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L262: +.L26: fcmp.ceq.d $fcc0, $f15, $f10 - bceqz $fcc0, .L272 + bceqz $fcc0, .L27 xvfcmp.clt.s VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 -.L272: +.L27: fcmp.ceq.d $fcc0, $f15, $f11 - bceqz $fcc0, .L282 + bceqz $fcc0, .L28 xvfcmp.clt.s VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 -.L282: +.L28: fcmp.ceq.d $fcc0, $f15, $f12 - bceqz $fcc0, .L292 + bceqz $fcc0, .L29 xvfcmp.clt.s VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 -.L292: - fcmp.clt.s $fcc0, $f15, $f13 - fsel $f15, $f15, $f13, $fcc0 - fsel $f20, $f20, $f16, $fcc0 +.L29: movfr2gr.s i0, $f20 + .align 3 #endif -.L21: //N<8 -#ifdef DOUBLE +.L21: //N<4 andi I, N, 3 bge $r0, I, .L999 srai.d i1, N, 2 slli.d i1, i1, 2 -#else - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 -#endif addi.d i1, i1, 1 //current index movgr2fr.d $f21, i1 movgr2fr.d $f20, i0 @@ -550,10 +469,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d i1, i1, 1 movgr2fr.d $f21, i1 blt $r0, I, .L22 - MTG i0, $f20 + MTG i0, $f20 .align 3 - .L999: move $r4, $r17 jirl $r0, $r1, 0x0 From 57bb46bedfca77fdbce2a480cb3949ca5ea9ab91 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:42:22 +0800 Subject: [PATCH 146/205] Loongarch64: fixed rot_lasx --- kernel/loongarch64/rot_lasx.S | 1300 ++++----------------------------- 1 file changed, 123 insertions(+), 1177 deletions(-) diff --git a/kernel/loongarch64/rot_lasx.S b/kernel/loongarch64/rot_lasx.S index 71378e0b2..386e9136f 100644 --- a/kernel/loongarch64/rot_lasx.S +++ b/kernel/loongarch64/rot_lasx.S @@ -64,6 +64,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT + move XX, X + move YY, Y #ifdef DOUBLE movfr2gr.d t1, C xvreplgr2vr.d VXC, t1 @@ -80,27 +82,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvreplgr2vr.w VXZ, t3 #endif srai.d I, N, 3 + bge $r0, I, .L997 bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 + bne INCY, TEMP, .L121 // INCX==1 and INCY!=1 + b .L111 // INCX==1 and INCY==1 .L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L110 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L112 // C!=0 S==0 - b .L111 // C!=0 S!=0 - .align 3 - -.L110: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L114 // C==0 S==0 - b .L113 // C==0 S!=0 - .align 3 + bne INCY, TEMP, .L221 // INCX!=1 and INCY!=1 + b .L211 // INCX!=1 and INCY==1 .L111: // C!=0 S!=0 xvld VX0, X, 0 * SIZE @@ -130,90 +118,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
b .L997 .align 3 -.L112: // C!=0 S==0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvld VX3, Y, 4 * SIZE -#endif - XVMUL VT0, VX0, VXC - XVMUL VT1, VX2, VXC - xvst VT0, X, 0 * SIZE - xvst VT1, Y, 0 * SIZE -#ifdef DOUBLE - XVMUL VT0, VX1, VXC - XVMUL VT1, VX3, VXC - xvst VT0, X, 4 * SIZE - xvst VT1, Y, 4 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // C==0 S!=0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvld VX3, Y, 4 * SIZE -#endif - XVMUL VT0, VX2, VXS - XVMUL VT1, VX0, VXS - XVFSUB VT1, VXZ, VT1 - xvst VT0, X, 0 * SIZE - xvst VT1, Y, 0 * SIZE -#ifdef DOUBLE - XVMUL VT0, VX3, VXS - XVMUL VT1, VX1, VXS - xvfsub.d VT1, VXZ, VT1 - xvst VT0, X, 4 * SIZE - xvst VT1, Y, 4 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // C==0 S==0 - xvst VXZ, X, 0 * SIZE - xvst VXZ, Y, 0 * SIZE -#ifdef DOUBLE - xvst VXZ, X, 4 * SIZE - xvst VXZ, Y, 4 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - move XX, X - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L120 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L122 // C!=0 S==0 - b .L121 // C!=0 S!=0 - .align 3 - -.L120: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L124 // C==0 S==0 - b .L123 // C==0 S!=0 - .align 3 - .L121: // C!=0 S!=0 - xvld VX0, X, 0 * SIZE #ifdef DOUBLE + xvld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -221,12 +128,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld.d t3, Y, 0 * SIZE add.d Y, Y, INCY ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY xvinsgr2vr.d VX2, t1, 0 xvinsgr2vr.d VX2, t2, 1 xvinsgr2vr.d VX2, t3, 2 xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY #else + xvld VX0, X, 0 * SIZE ld.w t1, Y, 0 * SIZE add.d Y, Y, INCY ld.w t2, Y, 0 * SIZE @@ -234,133 +142,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld.w t3, Y, 0 * SIZE add.d Y, Y, INCY ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY xvinsgr2vr.w VX2, t1, 0 xvinsgr2vr.w VX2, t2, 1 xvinsgr2vr.w VX2, t3, 2 xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE + ld.w t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE xvinsgr2vr.w VX2, t1, 4 xvinsgr2vr.w VX2, t2, 5 xvinsgr2vr.w VX2, t3, 6 xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY #endif XVMUL VT0, VX0, VXC XVFMADD VT0, VX2, VXS, VT0 XVMUL VT1, VX0, VXS XVMSUB VT1, VX2, VXC, VT1 - -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvst VT0, X, 0 * SIZE - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - XVMUL VT0, VX1, VXC - XVFMADD VT0, VX3, VXS, VT0 - XVMUL VT1, VX1, VXS - XVMSUB VT1, VX3, VXC, VT1 - xvst VT0, X, 4 * SIZE - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 -#else xvst VT0, X, 0 * SIZE - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - -#endif - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // C!=0 S==0 #ifdef DOUBLE - xvld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX2, t1, 0 - xvinsgr2vr.d VX2, t2, 1 - xvinsgr2vr.d VX2, t3, 2 - xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX0, VXC - xvfmul.d VT1, VX2, VXC - xvld VX1, X, 4 * SIZE - xvst VT0, X, 0 * SIZE - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX1, VXC - xvfmul.d VT1, VX3, VXC - addi.d I, I, -1 - xvst VT0, X, 4 * SIZE xvstelm.d VT1, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 1 @@ -368,102 +173,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvstelm.d VT1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 -#else - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmul.s VT1, VX2, VXC - xvst VT0, X, 0 * SIZE - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 -#endif - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 -.L123: // C==0 S!=0 -#ifdef DOUBLE - xvld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE + xvld VX0, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE + ld.d t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE xvinsgr2vr.d VX2, t1, 0 xvinsgr2vr.d VX2, t2, 1 xvinsgr2vr.d VX2, t3, 2 xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX2, VXS - xvfmul.d VT1, VX0, VXS - xvfsub.d VT1, VXZ, VT1 - xvst VT0, X, 0 * SIZE - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY - xvld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX3, VXS - xvfmul.d VT1, VX1, VXS - xvfsub.d VT1, VXZ, VT1 - addi.d I, I, -1 + + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 + xvst VT0, X, 4 * SIZE xvstelm.d VT1, YY, 0, 0 add.d YY, YY, INCY @@ -472,36 +202,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvstelm.d VT1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY #else - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX2, VXS - xvfmul.s VT1, VX0, VXS - xvfsub.s VT1, VXZ, VT1 - xvst VT0, X, 0 * SIZE xvstelm.w VT1, YY, 0, 0 add.d YY, YY, INCY xvstelm.w VT1, YY, 0, 1 @@ -517,135 +219,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvstelm.w VT1, YY, 0, 6 add.d YY, YY, INCY xvstelm.w VT1, YY, 0, 7 -#endif add.d YY, YY, INCY +#endif addi.d X, X, 8 * SIZE addi.d I, I, -1 - blt $r0, I, .L123 + blt $r0, I, .L121 b .L997 .align 3 -.L124: // C==0 S==0 - xvst VXZ, X, 0 * SIZE +.L211: // C!=0 S!=0 #ifdef DOUBLE - xvst VXZ, X, 0 * SIZE - xvst VXZ, X, 4 * SIZE - xvstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 3 + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 #else - xvst VXZ, X, 0 * SIZE - xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 -#endif - add.d YY, YY, INCY - addi.d I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L124 - move Y, YY - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - move XX, X - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L210 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L212 // C!=0 S==0 - b .L211 // C!=0 S!=0 - .align 3 - -.L210: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L214 // C==0 S==0 - b .L213 // C==0 S!=0 - .align 3 - -.L211: // C!=0 S!=0 -#ifdef DOUBLE xvld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE + ld.w t1, X, 0 * SIZE add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - add.d X, X, INCX - xvfmul.d VT0, VXC, VX0 - xvfmadd.d VT0, VX2, VXS, VT0 - xvfmul.d VT1, VXS, VX0 - xvfmsub.d VT1, VX2, VXC, VT1 - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - add.d X, X, INCX - xvfmul.d VT0, VX1, VXC - xvfmadd.d VT0, VX3, VXS, VT0 - xvfmul.d VT1, VX1, VXS - xvfmsub.d VT1, VX3, VXC, VT1 - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvst VT1, Y, 4 * SIZE -#else - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 * SIZE add.d X, X, INCX ld.w t3, X, 0 * SIZE add.d X, X, INCX @@ -655,166 +256,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvinsgr2vr.w VX0, t2, 1 xvinsgr2vr.w VX0, t3, 2 xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 * SIZE add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 * SIZE add.d X, X, INCX ld.w t3, X, 0 * SIZE add.d X, X, INCX ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VT0, VXC, VX0 - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE -#endif - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // C!=0 S==0 -#ifdef DOUBLE - xvld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - add.d X, X, INCX - xvfmul.d VT0, VXC, VX0 - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvfmul.d VT1, VX2, VXC - xvst VT1, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - add.d X, X, INCX - xvfmul.d VT0, VX1, VXC - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvfmul.d VT1, VX3, VXS - xvst VT1, Y, 4 * SIZE -#else - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE xvinsgr2vr.w VX0, t1, 4 xvinsgr2vr.w VX0, t2, 5 xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VT0, VXC, VX0 - xvfmul.s VT1, VX2, VXC - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE #endif - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // C==0 S!=0 + XVMUL VT0, VXC, VX0 + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VXS, VX0 + XVMSUB VT1, VX2, VXC, VT1 #ifdef DOUBLE - xvld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - 
xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - add.d X, X, INCX - xvfmul.d VT0, VXS, VX2 - xvfmul.d VT1, VXS, VX0 - xvfsub.d VT1, VXZ, VT1 xvstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX xvstelm.d VT0, XX, 0, 1 @@ -824,148 +283,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvstelm.d VT0, XX, 0, 3 add.d XX, XX, INCX xvst VT1, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - add.d X, X, INCX - xvfmul.d VT0, VX3, VXS - xvfmul.d VT1, VX1, VXS - xvfsub.d VT1, VXZ, VT1 - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvst VT1, Y, 4 * SIZE -#else - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VT0, VXS, VX2 - xvfmul.s VT1, VXS, VX0 - xvfsub.s VT1, VXZ, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE -#endif - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // C==0 S==0 -#ifdef DOUBLE - xvstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - xvstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvst VT1, Y, 4 * SIZE -#else - xvstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - xvstelm.w VXZ, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 7 - add.d XX, XX, INCX -#endif - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - move XX, X - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L220 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L222 // C!=0 S==0 - b .L221 // C!=0 S!=0 - .align 3 -.L220: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L224 // C==0 S==0 - b .L223 // C==0 S!=0 - .align 3 - -.L221: // C!=0 S!=0 -#ifdef DOUBLE + xvld VX2, Y, 4 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -978,135 +297,21 @@ USE OF THIS 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.d VX0, t2, 1 xvinsgr2vr.d VX0, t3, 2 xvinsgr2vr.d VX0, t4, 3 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX2, t1, 0 - xvinsgr2vr.d VX2, t2, 1 - xvinsgr2vr.d VX2, t3, 2 - xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX0, VXC - xvfmadd.d VT0, VX2, VXS, VT0 - xvfmul.d VT1, VX0, VXS - xvfmsub.d VT1, VX2, VXC, VT1 - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX1, VXC - xvfmadd.d VT0, VX3, VXS, VT0 - xvfmul.d VT1, VX1, VXS - xvfmsub.d VT1, VX3, VXC, VT1 + + XVMUL VT0, VXC, VX0 + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VXS, VX0 + XVMSUB VT1, VX2, VXC, VT1 xvstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX xvstelm.d VT0, XX, 0, 1 add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY -#else - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 4 * SIZE +#else xvstelm.w VT0, XX, 0, 0 add.d XX, XX, INCX xvstelm.w VT0, XX, 0, 1 @@ -1123,232 +328,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add.d XX, XX, INCX xvstelm.w VT0, XX, 0, 7 add.d XX, XX, INCX - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY + xvst VT1, Y, 0 * SIZE #endif + addi.d Y, Y, 8 * SIZE addi.d I, I, -1 - blt $r0, I, .L221 + blt $r0, I, .L211 b .L997 .align 3 -.L222: // C!=0 S==0 +.L221: // C!=0 S!=0 #ifdef DOUBLE - ld.d t1, X, 0 * SIZE + ld.d t1, X, 0 * SIZE add.d X, X, INCX - ld.d t2, X, 0 * SIZE + ld.d t2, X, 0 * SIZE add.d X, X, INCX - ld.d t3, X, 0 * SIZE + ld.d t3, X, 0 * SIZE add.d X, X, INCX - ld.d t4, X, 0 * SIZE + ld.d t4, X, 0 * SIZE add.d X, X, INCX xvinsgr2vr.d VX0, t1, 0 xvinsgr2vr.d VX0, t2, 1 xvinsgr2vr.d VX0, t3, 2 xvinsgr2vr.d VX0, t4, 3 - ld.d t1, Y, 0 * SIZE + ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE xvinsgr2vr.d VX2, t1, 0 xvinsgr2vr.d VX2, t2, 1 xvinsgr2vr.d VX2, t3, 2 xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX0, VXC - xvfmul.d VT1, VX2, VXC - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX1, VXC - xvfmul.d VT1, VX3, VXC - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY #else - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 * SIZE add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 * SIZE add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 * SIZE add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 * SIZE add.d X, X, INCX xvinsgr2vr.w VX0, t1, 0 xvinsgr2vr.w VX0, t2, 1 xvinsgr2vr.w VX0, t3, 2 xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 * SIZE add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 * SIZE add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 * SIZE add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 * SIZE add.d X, X, INCX xvinsgr2vr.w VX0, t1, 4 xvinsgr2vr.w VX0, t2, 5 xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE + + ld.w t1, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE + ld.w t2, Y, 0 * SIZE add.d 
Y, Y, INCY - ld.w t3, Y, 0 * SIZE + ld.w t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE + ld.w t4, Y, 0 * SIZE add.d Y, Y, INCY xvinsgr2vr.w VX2, t1, 0 xvinsgr2vr.w VX2, t2, 1 xvinsgr2vr.w VX2, t3, 2 xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE + ld.w t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE xvinsgr2vr.w VX2, t1, 4 xvinsgr2vr.w VX2, t2, 5 xvinsgr2vr.w VX2, t3, 6 xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmul.s VT1, VX2, VXC - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY #endif - addi.d I, I, -1 - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // C==0 S!=0 + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 #ifdef DOUBLE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX2, t1, 0 - xvinsgr2vr.d VX2, t2, 1 - xvinsgr2vr.d VX2, t3, 2 - xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX2, VXS - xvfmul.d VT1, VX0, VXS - xvfsub.d VT1, VXZ, VT1 xvstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX xvstelm.d VT0, XX, 0, 1 @@ -1365,33 +434,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE + + ld.d t1, X, 0 * SIZE add.d X, X, INCX - ld.d t2, X, 0 * SIZE + ld.d t2, X, 0 * SIZE add.d X, X, INCX - ld.d t3, X, 0 * SIZE + ld.d t3, X, 0 * SIZE add.d X, X, INCX - ld.d t4, X, 0 * SIZE + ld.d t4, X, 0 * SIZE add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - ld.d t1, Y, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE + ld.d t2, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE + ld.d t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 + ld.d t4, Y, 0 * SIZE add.d Y, Y, INCY - xvfmul.d VT0, VX3, VXS - xvfmul.d VT1, VX0, VXS - xvfsub.d VT1, VXZ, VT1 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 xvstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX xvstelm.d VT0, XX, 0, 1 @@ -1409,57 +481,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvstelm.d VT1, YY, 0, 3 add.d YY, YY, INCY #else - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX2, VXS - xvfmul.s VT1, VX0, VXS - xvfsub.s VT1, VXZ, VT1 xvstelm.w VT0, XX, 0, 0 add.d XX, XX, INCX xvstelm.w VT0, XX, 0, 1 @@ -1476,6 +497,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d XX, XX, INCX xvstelm.w VT0, XX, 0, 7 add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 add.d YY, YY, INCY xvstelm.w VT1, YY, 0, 1 @@ -1494,83 +516,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add.d YY, YY, INCY #endif addi.d I, I, -1 - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // C==0 S==0 -#ifdef DOUBLE - xvstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 3 -#else - xvstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 -#endif - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 -#ifdef DOUBLE - move X, XX - move Y, YY -#endif + blt $r0, I, .L221 b .L997 .align 3 From b471fa337bdc59e11df6baacfd9d1202bb4079a7 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:42:36 +0800 Subject: [PATCH 147/205] Loongarch64: fixed snrm2_lasx --- kernel/loongarch64/snrm2_lasx.S | 126 ++++++++++++++++++++++++-------- 1 file changed, 94 insertions(+), 32 deletions(-) diff --git a/kernel/loongarch64/snrm2_lasx.S b/kernel/loongarch64/snrm2_lasx.S index 3ae11e897..e5c9c557a 100644 --- a/kernel/loongarch64/snrm2_lasx.S +++ b/kernel/loongarch64/snrm2_lasx.S @@ -43,15 +43,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define t2 $r13 #define t3 $r14 #define t4 $r15 - -/* Don't change following FR unless you know the effects. */ #define VX0 $xr15 #define VX1 $xr16 #define VX2 $xr17 #define VX3 $xr18 #define VX4 $xr21 +#define VX5 $xr22 +/* Don't change following FR unless you know the effects. */ #define res1 $xr19 #define res2 $xr20 +#define RCP $f2 +#define VALPHA $xr3 + +// The optimization for snrm2 cannot simply involve +// extending the data type from float to double and +// then summing the squares of the data. LAPACK tests +// have shown that this approach can still lead to data overflow. +// Instead, we need to find the maximum absolute value in the entire +// array and divide each data element by this maximum value before +// performing the calculation. This approach can avoid overflow (and does not require extending the data type). PROLOGUE @@ -59,29 +69,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
LDINT N, 0(N) LDINT INCX, 0(INCX) #endif + bge $r0, N, .L999 + beq $r0, INCX, .L999 + addi.d $sp, $sp, -32 + st.d $ra, $sp, 0 + st.d N, $sp, 8 + st.d X, $sp, 16 + st.d INCX, $sp, 24 +#ifdef DYNAMIC_ARCH + bl samax_k_LA264 +#else + bl samax_k +#endif + ld.d $ra, $sp, 0 + ld.d N, $sp, 8 + ld.d X, $sp, 16 + ld.d INCX, $sp, 24 + addi.d $sp, $sp, 32 + + frecip.s RCP, $f0 + vreplvei.w $vr3, $vr2, 0 + xvpermi.d VALPHA, $xr3,0x00 xvxor.v res1, res1, res1 xvxor.v res2, res2, res2 - bge $r0, N, .L999 - beq $r0, INCX, .L999 + fcmp.ceq.s $fcc0, $f0, $f19 + bcnez $fcc0, .L999 li.d TEMP, SIZE slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 + srai.d I, N, 4 bne INCX, TEMP, .L20 - bge $r0, I, .L997 + bge $r0, I, .L997 .align 3 .L10: - xvld VX0, X, 0 - xvfcvtl.d.s VX1, VX0 - xvfcvth.d.s VX2, VX0 - xvfmadd.d res1, VX1, VX1, res1 - xvfmadd.d res2, VX2, VX2, res2 + xvld VX0, X, 0 + xvld VX5, X, 8 * SIZE addi.d I, I, -1 - addi.d X, X, 8 * SIZE + addi.d X, X, 16 * SIZE + + xvfmul.s VX0, VX0, VALPHA + xvfmul.s VX5, VX5, VALPHA + + xvfmadd.s res1, VX0, VX0, res1 + xvfmadd.s res2, VX5, VX5, res2 blt $r0, I, .L10 - .align 3 b .L996 + .align 3 .L20: bge $r0, I, .L997 @@ -107,47 +141,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld.w t3, X, 0 add.d X, X, INCX ld.w t4, X, 0 + add.d X, X, INCX xvinsgr2vr.w VX0, t1, 4 xvinsgr2vr.w VX0, t2, 5 xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VX0, VX0, VALPHA + xvfmadd.s res1, VX0, VX0, res1 + + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 add.d X, X, INCX - xvfcvtl.d.s VX1, VX0 - xvfcvth.d.s VX2, VX0 - xvfmadd.d res1, VX1, VX1, res1 - xvfmadd.d res2, VX2, VX2, res2 + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VX0, VX0, VALPHA + xvfmadd.s res2, VX0, VX0, res2 addi.d I, I, -1 blt $r0, I, .L21 - b .L996 + .align 3 .L996: - xvfadd.d res1, res1, res2 - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - fadd.d $f19, $f19, $f16 - fadd.d $f19, $f19, $f17 - fadd.d $f19, $f19, $f18 + xvfadd.s res1, res1, res2 + xvpermi.d VX1, res1, 0x4e + xvfadd.s res1, res1, VX1 + vreplvei.w $vr16, $vr19, 1 + vreplvei.w $vr17, $vr19, 2 + vreplvei.w $vr18, $vr19, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 .align 3 .L997: - andi I, N, 7 + andi I, N, 15 bge $r0, I, .L999 .align 3 .L998: fld.s $f15, X, 0 - add.d X, X, INCX - addi.d I, I, -1 - fcvt.d.s $f15, $f15 - fmadd.d $f19, $f15, $f15, $f19 + addi.d I, I, -1 + fmul.s $f15, $f15, RCP + fmadd.s $f19, $f15, $f15, $f19 + add.d X, X, INCX blt $r0, I, .L998 .align 3 .L999: - fsqrt.d $f19, $f19 + fsqrt.s $f19, $f19 + fmul.s $f0, $f19, $f0 move $r4, $r17 - fcvt.s.d $f0, $f19 jirl $r0, $r1, 0x0 + .align 3 EPILOGUE From f19e72c40293f617ab290fad2e7fc6865cc772cb Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:42:52 +0800 Subject: [PATCH 148/205] Loongarch64: fixed swap_lasx --- kernel/loongarch64/swap_lasx.S | 65 +++++----------------------------- 1 file changed, 9 insertions(+), 56 deletions(-) diff --git a/kernel/loongarch64/swap_lasx.S b/kernel/loongarch64/swap_lasx.S index 4767fffe3..3e7a14c42 100644 --- a/kernel/loongarch64/swap_lasx.S +++ 
b/kernel/loongarch64/swap_lasx.S @@ -318,62 +318,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. move XX, X .L222: - LD a1, X, 0 - add.d X, X, INCX - LD a2, X, 0 - add.d X, X, INCX - LD a3, X, 0 - add.d X, X, INCX - LD a4, X, 0 - add.d X, X, INCX - LD b1, Y, 0 - ST a1, Y, 0 - add.d Y, Y, INCY - LD b2, Y, 0 - ST a2, Y, 0 - add.d Y, Y, INCY - LD b3, Y, 0 - ST a3, Y, 0 - add.d Y, Y, INCY - LD b4, Y, 0 - ST a4, Y, 0 - add.d Y, Y, INCY - LD a1, X, 0 - add.d X, X, INCX - ST b1, XX, 0 - add.d XX, XX, INCX - LD b1, Y, 0 - ST a1, Y, 0 - add.d Y, Y, INCY - LD a2, X, 0 - add.d X, X, INCX - ST b2, XX, 0 - add.d XX, XX, INCX - LD b2, Y, 0 - ST a2, Y, 0 - add.d Y, Y, INCY - LD a3, X, 0 - add.d X, X, INCX - ST b3, XX, 0 - add.d XX, XX, INCX - LD b3, Y, 0 - ST a3, Y, 0 - LD a4, X, 0 - add.d X, X, INCX - ST b4, XX, 0 - add.d XX, XX, INCX - LD b4, Y, 0 - ST a4, Y, 0 - add.d Y, Y, INCY - ST b1, XX, 0 - add.d XX, XX, INCX - ST b2, XX, 0 - add.d XX, XX, INCX - ST b3, XX, 0 - add.d XX, XX, INCX - ST b4, XX, 0 - add.d XX, XX, INCX - addi.d I, I, -1 +.rept 8 + LD $f12, X, 0 + LD $f14, Y, 0 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + add.d Y, Y, INCY +.endr + addi.d I, I, -1 blt $r0, I, .L222 .align 3 From 4bee135cc1362c168d419e3b021e4a97f5883816 Mon Sep 17 00:00:00 2001 From: Scott Tsai Date: Wed, 30 Apr 2025 20:53:11 +0800 Subject: [PATCH 149/205] cpuid_x86: improve Intel Arrow Lake detection Add Intel Arrow Lake CPUIDs. See the datasheet: https://edc.intel.com/content/www/us/en/design/products/platforms/details/arrow-lake-s/core-ultra-200s-series-processors-datasheet-volume-1-of-2/cpuid/ --- cpuid_x86.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 4e13f1462..1b09c7217 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1578,6 +1578,7 @@ int get_cpuname(void){ case 12: //family 6 exmodel 12 switch (model) { case 15: + case 6: // Arrow Lake if(support_avx512()) return CPUTYPE_SAPPHIRERAPIDS; if(support_avx2()) @@ -2421,6 +2422,22 @@ int get_coretype(void){ else return CORE_NEHALEM; } + case 12: + switch (model) { + case 6: // Arrow Lake + if(support_amx_bf16()) + return CORE_SAPPHIRERAPIDS; + if(support_avx512_bf16()) + return CORE_COOPERLAKE; + if(support_avx512()) + return CORE_SKYLAKEX; + if(support_avx2()) + return CORE_HASWELL; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } } case 15: if (model <= 0x2) return CORE_NORTHWOOD; From 47b43054f18ea7bf36eb5d901f1533f2e0cb4cc3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 May 2025 11:52:22 +0200 Subject: [PATCH 150/205] Avoid out of bounds accesses in SCAL when INFO<0 --- lapack-netlib/SRC/dgeev.f | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/SRC/dgeev.f b/lapack-netlib/SRC/dgeev.f index 4677b9f52..fc73bb226 100644 --- a/lapack-netlib/SRC/dgeev.f +++ b/lapack-netlib/SRC/dgeev.f @@ -506,17 +506,17 @@ * Undo scaling if necessary * 50 CONTINUE - IF( SCALEA ) THEN + IF( SCALEA .AND. 
INFO.GT.0) THEN CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WR( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WI( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) - IF( INFO.GT.0 ) THEN + CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WR, N, $ IERR ) CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WI, N, $ IERR ) - END IF + END IF * WORK( 1 ) = MAXWRK

From d48a2fc4692600826622bda8e36d865cc17e3f50 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 May 2025 11:53:50 +0200 Subject: [PATCH 151/205] Avoid out of bounds accesses in SCAL when INFO<0 --- lapack-netlib/SRC/cgeev.f | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lapack-netlib/SRC/cgeev.f b/lapack-netlib/SRC/cgeev.f index bb41599d1..af14aa73a 100644 --- a/lapack-netlib/SRC/cgeev.f +++ b/lapack-netlib/SRC/cgeev.f @@ -485,12 +485,12 @@ * Undo scaling if necessary * 50 CONTINUE - IF( SCALEA ) THEN + IF( SCALEA .AND. INFO.GT.0 ) THEN CALL CLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, W( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) - IF( INFO.GT.0 ) THEN + CALL CLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, W, N, IERR ) - END IF + END IF * WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)

From 4c0445aed1eafde035add619fac1d92d64b004db Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 May 2025 11:56:07 +0200 Subject: [PATCH 152/205] Avoid out of bounds accesses in SCAL when INFO <0 --- lapack-netlib/SRC/sgeev.f | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lapack-netlib/SRC/sgeev.f b/lapack-netlib/SRC/sgeev.f index 93f993265..adf1a1a9d 100644 --- a/lapack-netlib/SRC/sgeev.f +++ b/lapack-netlib/SRC/sgeev.f @@ -504,17 +504,17 @@ * Undo scaling if necessary * 50 CONTINUE - IF( SCALEA ) THEN + IF( SCALEA .AND. INFO.GT.0) THEN CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WR( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WI( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) - IF( INFO.GT.0 ) THEN + CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WR, N, $ IERR ) CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WI, N, $ IERR ) - END IF + END IF * WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)

From 5c958dfe1eba7a2f5e6802a39199ab3c65c1a52f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 May 2025 11:58:21 +0200 Subject: [PATCH 153/205] Avoid out of bounds accesses in SCAL when INFO<0 --- lapack-netlib/SRC/zgeev.f | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lapack-netlib/SRC/zgeev.f b/lapack-netlib/SRC/zgeev.f index b968900e2..6cf5c669c 100644 --- a/lapack-netlib/SRC/zgeev.f +++ b/lapack-netlib/SRC/zgeev.f @@ -485,12 +485,12 @@ * Undo scaling if necessary * 50 CONTINUE - IF( SCALEA ) THEN + IF( SCALEA .AND.
INFO.GT.0) THEN CALL ZLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, W( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) - IF( INFO.GT.0 ) THEN + CALL ZLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, W, N, IERR ) - END IF + END IF * WORK( 1 ) = MAXWRK From 3c878f3e706e0b718dfc097dd86f511754cbcd65 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 9 May 2025 05:38:53 -0700 Subject: [PATCH 154/205] Cirrus CI: Update xcode version in the Apple crossbuilds (#5254) * Update xcode version in the Apple crossbuilds --- .cirrus.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 741e04e18..da3236673 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -58,8 +58,8 @@ task: - export VALID_ARCHS="i386 x86_64" - xcrun --sdk macosx --show-sdk-path - xcodebuild -version - - export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.5.sdk -arch x86_64" + - export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX15.4.sdk -arch x86_64" - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" always: config_artifacts: @@ -78,8 +78,8 @@ task: - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - - export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk -arch arm64 -miphoneos-version-min=10.0" + - export CC=/Applications/Xcode_16.3.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_16.3.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS18.4.sdk -arch arm64 -miphoneos-version-min=10.0" - xcrun --sdk iphoneos --show-sdk-path - ls -l /Applications - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 From ebbe682f7d6cd841746878e947a5717f6e1b2c86 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 May 2025 08:38:06 -0700 Subject: [PATCH 155/205] Fix function prototypes --- ctest/c_cblat1c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c index d9a539097..a98cfdc31 100644 --- a/ctest/c_cblat1c.c +++ b/ctest/c_cblat1c.c @@ -440,7 +440,7 @@ static real c_b43 = (float)1.; extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*); static complex mwpcs[5], mwpct[5]; extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*); - extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_(); + extern /* Subroutine */ int cscaltest_(integer*, complex*, complex*, integer*); static complex cx[8]; extern real scnrm2test_(integer*, complex*, integer*); static integer np1; From 0d69a2930d9b31dbc343e3c7dfc725714d809abe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 May 2025 08:39:57 -0700 Subject: [PATCH 156/205] Fix empty prototypes of 
select/selctg --- lapack-netlib/SRC/cgees.c | 4 ++-- lapack-netlib/SRC/cgeesx.c | 4 ++-- lapack-netlib/SRC/cgges.c | 4 ++-- lapack-netlib/SRC/cgges3.c | 4 ++-- lapack-netlib/SRC/cggesx.c | 4 ++-- lapack-netlib/SRC/dgees.c | 2 +- lapack-netlib/SRC/dgeesx.c | 2 +- lapack-netlib/SRC/dgges.c | 4 ++-- lapack-netlib/SRC/dgges3.c | 4 ++-- lapack-netlib/SRC/dggesx.c | 4 ++-- lapack-netlib/SRC/sgees.c | 2 +- lapack-netlib/SRC/sgeesx.c | 2 +- lapack-netlib/SRC/sgges.c | 4 ++-- lapack-netlib/SRC/sgges3.c | 4 ++-- lapack-netlib/SRC/sggesx.c | 4 ++-- lapack-netlib/SRC/zgees.c | 4 ++-- lapack-netlib/SRC/zgeesx.c | 4 ++-- lapack-netlib/SRC/zgges.c | 5 +++-- lapack-netlib/SRC/zgges3.c | 5 +++-- lapack-netlib/SRC/zggesx.c | 5 +++-- 20 files changed, 39 insertions(+), 36 deletions(-) diff --git a/lapack-netlib/SRC/cgees.c b/lapack-netlib/SRC/cgees.c index 9145dc659..fca00f74f 100644 --- a/lapack-netlib/SRC/cgees.c +++ b/lapack-netlib/SRC/cgees.c @@ -710,8 +710,8 @@ or GE matrices */ /* > \ingroup complexGEeigen */ /* ===================================================================== */ -/* Subroutine */ void cgees_(char *jobvs, char *sort, L_fp select, integer *n, - complex *a, integer *lda, integer *sdim, complex *w, complex *vs, +/* Subroutine */ void cgees_(char *jobvs, char *sort, logical (*select)(complex*), + integer *n, complex *a, integer *lda, integer *sdim, complex *w, complex *vs, integer *ldvs, complex *work, integer *lwork, real *rwork, logical * bwork, integer *info) { diff --git a/lapack-netlib/SRC/cgeesx.c b/lapack-netlib/SRC/cgeesx.c index b4c408ce4..7b2a0e5b0 100644 --- a/lapack-netlib/SRC/cgeesx.c +++ b/lapack-netlib/SRC/cgeesx.c @@ -752,8 +752,8 @@ f"> */ /* > \ingroup complexGEeigen */ /* ===================================================================== */ -/* Subroutine */ void cgeesx_(char *jobvs, char *sort, L_fp select, char * - sense, integer *n, complex *a, integer *lda, integer *sdim, complex * +/* Subroutine */ void cgeesx_(char *jobvs, char *sort, logical (*select)(complex*), + char *sense, integer *n, complex *a, integer *lda, integer *sdim, complex * w, complex *vs, integer *ldvs, real *rconde, real *rcondv, complex * work, integer *lwork, real *rwork, logical *bwork, integer *info) { diff --git a/lapack-netlib/SRC/cgges.c b/lapack-netlib/SRC/cgges.c index ade0a3816..14350f7e3 100644 --- a/lapack-netlib/SRC/cgges.c +++ b/lapack-netlib/SRC/cgges.c @@ -784,8 +784,8 @@ or GE matrices */ /* > \ingroup complexGEeigen */ /* ===================================================================== */ -/* Subroutine */ void cgges_(char *jobvsl, char *jobvsr, char *sort, L_fp - selctg, integer *n, complex *a, integer *lda, complex *b, integer * +/* Subroutine */ void cgges_(char *jobvsl, char *jobvsr, char *sort, logical + (*selctg)(complex*,complex*), integer *n, complex *a, integer *lda, complex *b, integer * ldb, integer *sdim, complex *alpha, complex *beta, complex *vsl, integer *ldvsl, complex *vsr, integer *ldvsr, complex *work, integer * lwork, real *rwork, logical *bwork, integer *info) diff --git a/lapack-netlib/SRC/cgges3.c b/lapack-netlib/SRC/cgges3.c index 4cc9411a0..edc9e1b2d 100644 --- a/lapack-netlib/SRC/cgges3.c +++ b/lapack-netlib/SRC/cgges3.c @@ -783,8 +783,8 @@ f"> */ /* > \ingroup complexGEeigen */ /* ===================================================================== */ -/* Subroutine */ void cgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp - selctg, integer *n, complex *a, integer *lda, complex *b, integer * +/* Subroutine */ void cgges3_(char *jobvsl, 
char *jobvsr, char *sort, logical + (*selctg)(complex*,complex*), integer *n, complex *a, integer *lda, complex *b, integer * ldb, integer *sdim, complex *alpha, complex *beta, complex *vsl, integer *ldvsl, complex *vsr, integer *ldvsr, complex *work, integer * lwork, real *rwork, logical *bwork, integer *info) diff --git a/lapack-netlib/SRC/cggesx.c b/lapack-netlib/SRC/cggesx.c index 375332cdb..ec6f4152c 100644 --- a/lapack-netlib/SRC/cggesx.c +++ b/lapack-netlib/SRC/cggesx.c @@ -843,8 +843,8 @@ f"> */ /* > \ingroup complexGEeigen */ /* ===================================================================== */ -/* Subroutine */ void cggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp - selctg, char *sense, integer *n, complex *a, integer *lda, complex *b, +/* Subroutine */ void cggesx_(char *jobvsl, char *jobvsr, char *sort, logical + (*selctg)(complex*,complex*), char *sense, integer *n, complex *a, integer *lda, complex *b, integer *ldb, integer *sdim, complex *alpha, complex *beta, complex * vsl, integer *ldvsl, complex *vsr, integer *ldvsr, real *rconde, real *rcondv, complex *work, integer *lwork, real *rwork, integer *iwork, diff --git a/lapack-netlib/SRC/dgees.c b/lapack-netlib/SRC/dgees.c index b97c66575..0dd6547a8 100644 --- a/lapack-netlib/SRC/dgees.c +++ b/lapack-netlib/SRC/dgees.c @@ -729,7 +729,7 @@ or GE matrices */ /* > \ingroup doubleGEeigen */ /* ===================================================================== */ -/* Subroutine */ void dgees_(char *jobvs, char *sort, L_fp select, integer *n, +/* Subroutine */ void dgees_(char *jobvs, char *sort, logical(*select)(doublereal*,doublereal*), integer *n, doublereal *a, integer *lda, integer *sdim, doublereal *wr, doublereal *wi, doublereal *vs, integer *ldvs, doublereal *work, integer *lwork, logical *bwork, integer *info) diff --git a/lapack-netlib/SRC/dgeesx.c b/lapack-netlib/SRC/dgeesx.c index 5fa122ab6..8fba5454d 100644 --- a/lapack-netlib/SRC/dgeesx.c +++ b/lapack-netlib/SRC/dgeesx.c @@ -793,7 +793,7 @@ f"> */ /* > \ingroup doubleGEeigen */ /* ===================================================================== */ -/* Subroutine */ void dgeesx_(char *jobvs, char *sort, L_fp select, char * +/* Subroutine */ void dgeesx_(char *jobvs, char *sort, logical(*select)(doublereal*,doublereal*), char * sense, integer *n, doublereal *a, integer *lda, integer *sdim, doublereal *wr, doublereal *wi, doublereal *vs, integer *ldvs, doublereal *rconde, doublereal *rcondv, doublereal *work, integer * diff --git a/lapack-netlib/SRC/dgges.c b/lapack-netlib/SRC/dgges.c index a7f24de4a..28b82731e 100644 --- a/lapack-netlib/SRC/dgges.c +++ b/lapack-netlib/SRC/dgges.c @@ -798,8 +798,8 @@ or GE matrices */ /* > \ingroup doubleGEeigen */ /* ===================================================================== */ -/* Subroutine */ void dgges_(char *jobvsl, char *jobvsr, char *sort, L_fp - selctg, integer *n, doublereal *a, integer *lda, doublereal *b, +/* Subroutine */ void dgges_(char *jobvsl, char *jobvsr, char *sort, logical + (selctg)(doublereal*, doublereal*, doublereal*), integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer *sdim, doublereal *alphar, doublereal *alphai, doublereal *beta, doublereal *vsl, integer *ldvsl, doublereal *vsr, integer *ldvsr, doublereal *work, integer *lwork, logical *bwork, diff --git a/lapack-netlib/SRC/dgges3.c b/lapack-netlib/SRC/dgges3.c index 8e1139349..28b4ace5c 100644 --- a/lapack-netlib/SRC/dgges3.c +++ b/lapack-netlib/SRC/dgges3.c @@ -796,8 +796,8 @@ f"> */ /* > \ingroup 
doubleGEeigen */ /* ===================================================================== */ -/* Subroutine */ void dgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp - selctg, integer *n, doublereal *a, integer *lda, doublereal *b, +/* Subroutine */ void dgges3_(char *jobvsl, char *jobvsr, char *sort, logical + (*selctg)(doublereal*,doublereal*,doublereal*), integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer *sdim, doublereal *alphar, doublereal *alphai, doublereal *beta, doublereal *vsl, integer *ldvsl, doublereal *vsr, integer *ldvsr, doublereal *work, integer *lwork, logical *bwork, diff --git a/lapack-netlib/SRC/dggesx.c b/lapack-netlib/SRC/dggesx.c index 3f5f6cd17..be19607c4 100644 --- a/lapack-netlib/SRC/dggesx.c +++ b/lapack-netlib/SRC/dggesx.c @@ -878,8 +878,8 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ void dggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp - selctg, char *sense, integer *n, doublereal *a, integer *lda, +/* Subroutine */ void dggesx_(char *jobvsl, char *jobvsr, char *sort, logical + (*selctg)(doublereal*,doublereal*,doublereal*), char *sense, integer *n, doublereal *a, integer *lda, doublereal *b, integer *ldb, integer *sdim, doublereal *alphar, doublereal *alphai, doublereal *beta, doublereal *vsl, integer *ldvsl, doublereal *vsr, integer *ldvsr, doublereal *rconde, doublereal * diff --git a/lapack-netlib/SRC/sgees.c b/lapack-netlib/SRC/sgees.c index 99529f13d..18d0f27b0 100644 --- a/lapack-netlib/SRC/sgees.c +++ b/lapack-netlib/SRC/sgees.c @@ -482,7 +482,7 @@ or GE matrices */ /* > \ingroup realGEeigen */ /* ===================================================================== */ -/* Subroutine */ void sgees_(char *jobvs, char *sort, L_fp select, integer *n, +/* Subroutine */ void sgees_(char *jobvs, char *sort, logical(*select)(real*,real*), integer *n, real *a, integer *lda, integer *sdim, real *wr, real *wi, real *vs, integer *ldvs, real *work, integer *lwork, logical *bwork, integer * info) diff --git a/lapack-netlib/SRC/sgeesx.c b/lapack-netlib/SRC/sgeesx.c index 87f77ddd0..1d61f9c28 100644 --- a/lapack-netlib/SRC/sgeesx.c +++ b/lapack-netlib/SRC/sgeesx.c @@ -550,7 +550,7 @@ f"> */ /* > \ingroup realGEeigen */ /* ===================================================================== */ -/* Subroutine */ void sgeesx_(char *jobvs, char *sort, L_fp select, char * +/* Subroutine */ void sgeesx_(char *jobvs, char *sort, logical(*select)(real*,real*), char * sense, integer *n, real *a, integer *lda, integer *sdim, real *wr, real *wi, real *vs, integer *ldvs, real *rconde, real *rcondv, real * work, integer *lwork, integer *iwork, integer *liwork, logical *bwork, diff --git a/lapack-netlib/SRC/sgges.c b/lapack-netlib/SRC/sgges.c index 60a6d9348..b7c7691a9 100644 --- a/lapack-netlib/SRC/sgges.c +++ b/lapack-netlib/SRC/sgges.c @@ -555,8 +555,8 @@ or GE matrices */ /* > \ingroup realGEeigen */ /* ===================================================================== */ -/* Subroutine */ void sgges_(char *jobvsl, char *jobvsr, char *sort, L_fp - selctg, integer *n, real *a, integer *lda, real *b, integer *ldb, +/* Subroutine */ void sgges_(char *jobvsl, char *jobvsr, char *sort, logical + (*selctg)(real*,real*,real*), integer *n, real *a, integer *lda, real *b, integer *ldb, integer *sdim, real *alphar, real *alphai, real *beta, real *vsl, integer *ldvsl, real *vsr, integer *ldvsr, real *work, integer *lwork, logical *bwork, integer *info) diff --git 
a/lapack-netlib/SRC/sgges3.c b/lapack-netlib/SRC/sgges3.c index aefdda168..af99fb2fa 100644 --- a/lapack-netlib/SRC/sgges3.c +++ b/lapack-netlib/SRC/sgges3.c @@ -553,8 +553,8 @@ f"> */ /* > \ingroup realGEeigen */ /* ===================================================================== */ -/* Subroutine */ void sgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp - selctg, integer *n, real *a, integer *lda, real *b, integer *ldb, +/* Subroutine */ void sgges3_(char *jobvsl, char *jobvsr, char *sort, logical + (*selctg)(real*,real*,real*), integer *n, real *a, integer *lda, real *b, integer *ldb, integer *sdim, real *alphar, real *alphai, real *beta, real *vsl, integer *ldvsl, real *vsr, integer *ldvsr, real *work, integer *lwork, logical *bwork, integer *info) diff --git a/lapack-netlib/SRC/sggesx.c b/lapack-netlib/SRC/sggesx.c index 72c798f35..7f60d4ae6 100644 --- a/lapack-netlib/SRC/sggesx.c +++ b/lapack-netlib/SRC/sggesx.c @@ -635,8 +635,8 @@ f"> */ /* > \endverbatim */ /* > */ /* ===================================================================== */ -/* Subroutine */ void sggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp - selctg, char *sense, integer *n, real *a, integer *lda, real *b, +/* Subroutine */ void sggesx_(char *jobvsl, char *jobvsr, char *sort, logical + (*selctg)(real*,real*,real*), char *sense, integer *n, real *a, integer *lda, real *b, integer *ldb, integer *sdim, real *alphar, real *alphai, real *beta, real *vsl, integer *ldvsl, real *vsr, integer *ldvsr, real *rconde, real *rcondv, real *work, integer *lwork, integer *iwork, integer * diff --git a/lapack-netlib/SRC/zgees.c b/lapack-netlib/SRC/zgees.c index fea0f2573..eab6c7ba6 100644 --- a/lapack-netlib/SRC/zgees.c +++ b/lapack-netlib/SRC/zgees.c @@ -710,8 +710,8 @@ or GE matrices */ /* > \ingroup complex16GEeigen */ /* ===================================================================== */ -/* Subroutine */ void zgees_(char *jobvs, char *sort, L_fp select, integer *n, - doublecomplex *a, integer *lda, integer *sdim, doublecomplex *w, +/* Subroutine */ void zgees_(char *jobvs, char *sort, logical (*select)(doublecomplex*), + integer *n, doublecomplex *a, integer *lda, integer *sdim, doublecomplex *w, doublecomplex *vs, integer *ldvs, doublecomplex *work, integer *lwork, doublereal *rwork, logical *bwork, integer *info) { diff --git a/lapack-netlib/SRC/zgeesx.c b/lapack-netlib/SRC/zgeesx.c index 3749c53e5..bb5b6e164 100644 --- a/lapack-netlib/SRC/zgeesx.c +++ b/lapack-netlib/SRC/zgeesx.c @@ -751,8 +751,8 @@ f"> */ /* > \ingroup complex16GEeigen */ /* ===================================================================== */ -/* Subroutine */ void zgeesx_(char *jobvs, char *sort, L_fp select, char * - sense, integer *n, doublecomplex *a, integer *lda, integer *sdim, +/* Subroutine */ void zgeesx_(char *jobvs, char *sort, logical (*select)(doublecomplex*), + char * sense, integer *n, doublecomplex *a, integer *lda, integer *sdim, doublecomplex *w, doublecomplex *vs, integer *ldvs, doublereal * rconde, doublereal *rcondv, doublecomplex *work, integer *lwork, doublereal *rwork, logical *bwork, integer *info) diff --git a/lapack-netlib/SRC/zgges.c b/lapack-netlib/SRC/zgges.c index aab716add..f8426519d 100644 --- a/lapack-netlib/SRC/zgges.c +++ b/lapack-netlib/SRC/zgges.c @@ -784,8 +784,9 @@ or GE matrices */ /* > \ingroup complex16GEeigen */ /* ===================================================================== */ -/* Subroutine */ void zgges_(char *jobvsl, char *jobvsr, char *sort, L_fp - selctg, integer *n, 
doublecomplex *a, integer *lda, doublecomplex *b, +/* Subroutine */ void zgges_(char *jobvsl, char *jobvsr, char *sort, logical + (*selctg)(doublecomplex*,doublecomplex*), integer *n, doublecomplex *a, + integer *lda, doublecomplex *b, integer *ldb, integer *sdim, doublecomplex *alpha, doublecomplex * beta, doublecomplex *vsl, integer *ldvsl, doublecomplex *vsr, integer *ldvsr, doublecomplex *work, integer *lwork, doublereal *rwork, diff --git a/lapack-netlib/SRC/zgges3.c b/lapack-netlib/SRC/zgges3.c index 98149dabe..d0329ab0b 100644 --- a/lapack-netlib/SRC/zgges3.c +++ b/lapack-netlib/SRC/zgges3.c @@ -783,8 +783,9 @@ f"> */ /* > \ingroup complex16GEeigen */ /* ===================================================================== */ -/* Subroutine */ void zgges3_(char *jobvsl, char *jobvsr, char *sort, L_fp - selctg, integer *n, doublecomplex *a, integer *lda, doublecomplex *b, +/* Subroutine */ void zgges3_(char *jobvsl, char *jobvsr, char *sort, logical + (*selctg)(doublecomplex*,doublecomplex*), integer *n, doublecomplex *a, + integer *lda, doublecomplex *b, integer *ldb, integer *sdim, doublecomplex *alpha, doublecomplex * beta, doublecomplex *vsl, integer *ldvsl, doublecomplex *vsr, integer *ldvsr, doublecomplex *work, integer *lwork, doublereal *rwork, diff --git a/lapack-netlib/SRC/zggesx.c b/lapack-netlib/SRC/zggesx.c index fa4c5bd8d..3b621a0d2 100644 --- a/lapack-netlib/SRC/zggesx.c +++ b/lapack-netlib/SRC/zggesx.c @@ -843,8 +843,9 @@ f"> */ /* > \ingroup complex16GEeigen */ /* ===================================================================== */ -/* Subroutine */ void zggesx_(char *jobvsl, char *jobvsr, char *sort, L_fp - selctg, char *sense, integer *n, doublecomplex *a, integer *lda, +/* Subroutine */ void zggesx_(char *jobvsl, char *jobvsr, char *sort, logical + (*selctg)(doublecomplex*,doublecomplex*), char *sense, integer *n, + doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, integer *sdim, doublecomplex *alpha, doublecomplex *beta, doublecomplex *vsl, integer *ldvsl, doublecomplex *vsr, integer *ldvsr, doublereal *rconde, doublereal * From 5141a9099306be849429508e811292ee81802ba1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 10 May 2025 13:39:32 -0700 Subject: [PATCH 157/205] Fix ARMV9SME target in DYNAMIC_ARCH and add SME query code for MacOS (#5222) * Fix ARMV9SME target and add support_sme1 code for MacOS * make sgemm_direct unconditionally available on all arm64 * build a (dummy) sgemm_direct kernel on all arm64 * Update dynamic_arm64.c --- common_param.h | 2 -- driver/others/dynamic_arm64.c | 41 +++++++++++++++++++------- kernel/CMakeLists.txt | 4 ++- kernel/Makefile.L3 | 7 +++-- kernel/arm64/sgemm_direct_arm64_sme1.c | 6 ++++ kernel/setparam-ref.c | 2 -- 6 files changed, 44 insertions(+), 18 deletions(-) diff --git a/common_param.h b/common_param.h index d4d5a8eb2..2d771a27d 100644 --- a/common_param.h +++ b/common_param.h @@ -224,10 +224,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); #endif #ifdef ARCH_ARM64 -#ifdef HAVE_SME void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); #endif -#endif int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 31821ae78..428c5758b 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -43,6 +43,14 @@ #include #endif 
+#ifdef __APPLE__ +#include +int32_t value; +size_t length=sizeof(value); +int64_t value64; +size_t length64=sizeof(value64); +#endif + extern gotoblas_t gotoblas_ARMV8; #ifdef DYNAMIC_LIST #ifdef DYN_CORTEXA53 @@ -120,7 +128,7 @@ extern gotoblas_t gotoblas_ARMV9SME; #else #define gotoblas_ARMV9SME gotoblas_ARMV8 #endif -#ifdef DYN_CORTEX_A55 +#ifdef DYN_CORTEXA55 extern gotoblas_t gotoblas_CORTEXA55; #else #define gotoblas_CORTEXA55 gotoblas_ARMV8 @@ -147,17 +155,17 @@ extern gotoblas_t gotoblas_NEOVERSEV1; extern gotoblas_t gotoblas_NEOVERSEN2; extern gotoblas_t gotoblas_ARMV8SVE; extern gotoblas_t gotoblas_A64FX; +#ifndef NO_SME +extern gotoblas_t gotoblas_ARMV9SME; +#else +#define gotoblas_ARMV9SME gotoblas_ARMV8SVE +#endif #else #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #define gotoblas_ARMV8SVE gotoblas_ARMV8 #define gotoblas_A64FX gotoblas_ARMV8 -#endif - -#ifndef NO_SME -extern gotoblas_t gotoblas_ARMV9SME; -#else -#define gotoblas_ARMV9SME gotoblas_ARMV8SVE +#define gotoblas_ARMV9SME gotoblas_ARMV8 #endif extern gotoblas_t gotoblas_THUNDERX3T110; @@ -168,7 +176,7 @@ extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 18 +#define NUM_CORETYPES 19 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -207,6 +215,7 @@ static char *corename[] = { "cortexa55", "armv8sve", "a64fx", + "armv9sme", "unknown" }; @@ -229,6 +238,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; if (gotoblas == &gotoblas_A64FX) return corename[17]; + if (gotoblas == &gotoblas_ARMV9SME) return corename[18]; return corename[NUM_CORETYPES]; } @@ -266,6 +276,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 15: return (&gotoblas_CORTEXA55); case 16: return (&gotoblas_ARMV8SVE); case 17: return (&gotoblas_A64FX); + case 18: return (&gotoblas_ARMV9SME); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -277,6 +288,11 @@ static gotoblas_t *get_coretype(void) { char coremsg[128]; #if defined (OS_DARWIN) +//future #if !defined(NO_SME) +// if (support_sme1()) { +// return &gotoblas_ARMV9SME; +// } +// #endif return &gotoblas_NEOVERSEN1; #endif @@ -439,6 +455,7 @@ static gotoblas_t *get_coretype(void) { } break; case 0x61: // Apple +//future if (support_sme1()) return &gotoblas_ARMV9SME; return &gotoblas_NEOVERSEN1; break; default: @@ -446,8 +463,8 @@ static gotoblas_t *get_coretype(void) { openblas_warning(1, coremsg); } -#if !defined(NO_SME) && defined(HWCAP2_SME) - if ((getauxval(AT_HWCAP2) & HWCAP2_SME)) { +#if !defined(NO_SME) + if (support_sme1()) { return &gotoblas_ARMV9SME; } #endif @@ -511,6 +528,10 @@ int support_sme1(void) { if(getauxval(AT_HWCAP2) & HWCAP2_SME){ ret = 1; } +#endif +#if defined(__APPLE__) + sysctlbyname("hw.optional.arm.FEAT_SME",&value64,&length64,NULL,0); + ret = value64; #endif return ret; } diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 81185f603..48c895588 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -208,7 +208,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) set(USE_TRMM true) endif () set(USE_DIRECT_SGEMM false) - if (X86_64 OR (ARM64 AND (UC_TARGET_CORE MATCHES ARMV9SME))) + if 
(X86_64 OR ARM64) set(USE_DIRECT_SGEMM true) endif() @@ -225,9 +225,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S) set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) + if (HAVE_SME) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE) endif () + endif () endif() foreach (float_type SINGLE DOUBLE) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 41f16f9c9..2bd6b294f 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -103,8 +103,8 @@ endif ifeq ($(ARCH), arm64) ifeq ($(TARGET_CORE), ARMV9SME) HAVE_SME = 1 -SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c endif +SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c endif endif endif @@ -143,9 +143,10 @@ SKERNELOBJS += \ sgemm_direct_performant$(TSUFFIX).$(SUFFIX) endif ifeq ($(ARCH), arm64) +SKERNELOBJS += \ + sgemm_direct$(TSUFFIX).$(SUFFIX) ifdef HAVE_SME SKERNELOBJS += \ - sgemm_direct$(TSUFFIX).$(SUFFIX) \ sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) \ sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) endif @@ -835,9 +836,9 @@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif ifeq ($(ARCH), arm64) -ifdef HAVE_SME $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +ifdef HAVE_SME $(KDIR)sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) : $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1.S -UDOUBLE -UCOMPLEX -o $@ $(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) : diff --git a/kernel/arm64/sgemm_direct_arm64_sme1.c b/kernel/arm64/sgemm_direct_arm64_sme1.c index 50c2a9a2d..13c337a13 100644 --- a/kernel/arm64/sgemm_direct_arm64_sme1.c +++ b/kernel/arm64/sgemm_direct_arm64_sme1.c @@ -71,4 +71,10 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ free(A_mod); } +#else + +void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ + BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ + float * __restrict R, BLASLONG strideR){} + #endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 24c285557..5a5045ce2 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -180,9 +180,7 @@ gotoblas_t TABLE_NAME = { sgemm_direct_performantTS, #endif #ifdef ARCH_ARM64 -#ifdef HAVE_SME sgemm_directTS, -#endif #endif sgemm_kernelTS, sgemm_betaTS, From 73214446602758b9aaf73f48de8d3b81990b9343 Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Mon, 12 May 2025 13:41:21 +0000 Subject: [PATCH 158/205] enable sbgemm to be forward to sbgemv on arm64 --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index ac6a41c92..38646c3c6 100644 --- a/Makefile.system +++ b/Makefile.system @@ -276,6 +276,7 @@ SMALL_MATRIX_OPT = 1 endif ifeq ($(ARCH), arm64) GEMM_GEMV_FORWARD = 1 +GEMM_GEMV_FORWARD_BF16 = 1 endif ifeq ($(ARCH), riscv) GEMM_GEMV_FORWARD = 1 From 0ccb05058312caed86befc75923b6f888ae4e7a6 Mon Sep 17 00:00:00 2001 From: pengxu Date: Tue, 13 May 2025 16:08:33 +0800 Subject: [PATCH 159/205] Loongarch64: fixed cgemm_ncopy_16_lasx --- kernel/loongarch64/cgemm_ncopy_16_lasx.S | 774 +++++++---------------- 1 file changed, 212 insertions(+), 562 deletions(-) diff --git 
a/kernel/loongarch64/cgemm_ncopy_16_lasx.S b/kernel/loongarch64/cgemm_ncopy_16_lasx.S index 7c2d0ac64..4b9225314 100644 --- a/kernel/loongarch64/cgemm_ncopy_16_lasx.S +++ b/kernel/loongarch64/cgemm_ncopy_16_lasx.S @@ -45,18 +45,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define S6 $r17 #define S7 $r18 #define S8 $r19 -#define S9 $r20 -#define S10 $r23 -#define S11 $r24 -#define S12 $r25 -#define S13 $r26 -#define S14 $r27 -#define S15 $r28 -#define S16 $r29 -#define TD $r30 -#define TS $r31 +#define S9 $r23 +#define S10 $r24 +#define S11 $r25 +#define S12 $r26 +#define S13 $r27 +#define S14 $r28 +#define S15 $r29 +#define S16 $r30 +#define TD $r20 +#define TS $r11 #define TL $r7 -#define T0 $r6 #define ZERO $r0 #define F0 $f0 @@ -67,6 +66,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define F5 $f5 #define F6 $f6 #define F7 $f7 +#define F8 $f8 +#define F9 $f9 +#define F10 $f10 +#define F11 $f11 +#define F12 $f12 +#define F13 $f13 +#define F14 $f14 +#define F15 $f15 /* LASX vectors */ #define U0 $xr0 #define U1 $xr1 @@ -103,589 +110,232 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE - addi.d $sp, $sp, -0x90 - SDARG $r23, $sp, 0x00 - SDARG $r24, $sp, 0x08 - SDARG $r25, $sp, 0x10 - SDARG $r26, $sp, 0x18 - SDARG $r27, $sp, 0x20 - SDARG $r28, $sp, 0x28 - SDARG $r29, $sp, 0x30 - SDARG $r30, $sp, 0x38 - SDARG $r31, $sp, 0x40 - ST $f23, $sp, 0x48 - ST $f24, $sp, 0x50 - ST $f25, $sp, 0x58 - ST $f26, $sp, 0x60 - ST $f27, $sp, 0x68 - ST $f28, $sp, 0x70 - ST $f29, $sp, 0x78 - ST $f30, $sp, 0x80 - ST $f31, $sp, 0x88 - - move TD, DST - move TS, SRC - slli.d TL, LDA, 0x03 - slli.d T0, TL, 0x01 - srai.d J, N, 0x04 + addi.d $sp, $sp, -64 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + SDARG $r30, $sp, 56 + + move TD, DST //boffset + move TS, SRC //aoffset + slli.d TL, LDA, 0x03 //lda + srai.d J, N, 0x04 //j beq J, ZERO, .L_N8 -.L_J1: /* J-- */ +.L_J1: /* if(j>0) j--*/ move S1, TS add.d S2, TS, TL - srai.d I, M, 0x03 + move I, M add.d S3, S2, TL - addi.d J, J, -1 add.d S4, S3, TL - add.d S5, S3, T0 - add.d S6, S4, T0 - add.d S7, S5, T0 - add.d S8, S6, T0 - add.d S9, S7, T0 - add.d S10, S8, T0 - add.d S11, S9, T0 - add.d S12, S10, T0 - add.d S13, S11, T0 - add.d S14, S12, T0 - add.d S15, S13, T0 - add.d S16, S14, T0 - add.d TS, S15, T0 - beq I, ZERO, .L_I7 - -.L_I1: /* I-- */ - xvld U0, S1, 0x00 - xvld U1, S2, 0x00 - xvld U2, S3, 0x00 - xvld U3, S4, 0x00 - xvld U4, S5, 0x00 - xvld U5, S6, 0x00 - xvld U6, S7, 0x00 - xvld U7, S8, 0x00 - xvld U8, S9, 0x00 - xvld U9, S10, 0x00 - xvld U10, S11, 0x00 - xvld U11, S12, 0x00 - xvld U12, S13, 0x00 - xvld U13, S14, 0x00 - xvld U14, S15, 0x00 - xvld U15, S16, 0x00 - - xvpackev.d D0, U1, U0 - xvpackod.d D1, U1, U0 - xvpackev.d D2, U3, U2 - xvpackod.d D3, U3, U2 - xvpackev.d D4, U5, U4 - xvpackod.d D5, U5, U4 - xvpackev.d D6, U7, U6 - xvpackod.d D7, U7, U6 - - xvpackev.d D8, U9, U8 - xvpackod.d D9, U9, U8 - xvpackev.d D10, U11, U10 - xvpackod.d D11, U11, U10 - xvpackev.d D12, U13, U12 - xvpackod.d D13, U13, U12 - xvpackev.d D14, U15, U14 - xvpackod.d D15, U15, U14 - - xvand.v U0, D0, D0 - xvpermi.q D0, D2, 0x02 // 0 - xvand.v U4, D4, D4 - xvpermi.q D4, D6, 0x02 // 1 - xvand.v U1, D1, D1 - xvpermi.q D1, D3, 0x02 // 4 - xvand.v U5, D5, D5 - xvpermi.q D5, D7, 0x02 // 5 - xvpermi.q D2, U0, 0x31 // 8 - xvpermi.q D6, U4, 0x31 // 9 - xvpermi.q D3, U1, 0x31 // 12 - 
xvpermi.q D7, U5, 0x31 // 13 - - xvand.v U8, D8, D8 - xvpermi.q D8, D10, 0x02 // 2 - xvand.v U12, D12, D12 - xvpermi.q D12, D14, 0x02 // 3 - xvand.v U9, D9, D9 - xvpermi.q D9, D11, 0x02 // 6 - xvand.v U13, D13, D13 - xvpermi.q D13, D15, 0x02 // 7 - xvpermi.q D10, U8, 0x31 // 10 - xvpermi.q D14, U12, 0x31 // 11 - xvpermi.q D11, U9, 0x31 // 14 - xvpermi.q D15, U13, 0x31 // 15 - - xvst D0, TD, 0x00 // 0 - xvst D4, TD, 0x20 // 1 - xvst D8, TD, 0x40 // 2 - xvst D12, TD, 0x60 // 3 - xvst D1, TD, 0x80 // 4 - xvst D5, TD, 0xA0 // 5 - xvst D9, TD, 0xC0 // 6 - xvst D13, TD, 0xE0 // 7 - addi.d TD, TD, 0x100 - xvst D2, TD, 0x00 // 8 - xvst D6, TD, 0x20 // 9 - xvst D10, TD, 0x40 // 10 - xvst D14, TD, 0x60 // 11 - xvst D3, TD, 0x80 // 12 - xvst D7, TD, 0xA0 // 13 - xvst D11, TD, 0xC0 // 14 - xvst D15, TD, 0xE0 // 15 - addi.d TD, TD, 0x100 - - xvld U0, S1, 0x20 - xvld U1, S2, 0x20 - xvld U2, S3, 0x20 - xvld U3, S4, 0x20 - xvld U4, S5, 0x20 - xvld U5, S6, 0x20 - xvld U6, S7, 0x20 - xvld U7, S8, 0x20 - xvld U8, S9, 0x20 - xvld U9, S10, 0x20 - xvld U10, S11, 0x20 - xvld U11, S12, 0x20 - xvld U12, S13, 0x20 - xvld U13, S14, 0x20 - xvld U14, S15, 0x20 - xvld U15, S16, 0x20 - - xvpackev.d D0, U1, U0 - xvpackod.d D1, U1, U0 - xvpackev.d D2, U3, U2 - xvpackod.d D3, U3, U2 - xvpackev.d D4, U5, U4 - xvpackod.d D5, U5, U4 - xvpackev.d D6, U7, U6 - xvpackod.d D7, U7, U6 - - xvpackev.d D8, U9, U8 - xvpackod.d D9, U9, U8 - xvpackev.d D10, U11, U10 - xvpackod.d D11, U11, U10 - xvpackev.d D12, U13, U12 - xvpackod.d D13, U13, U12 - xvpackev.d D14, U15, U14 - xvpackod.d D15, U15, U14 - - xvand.v U0, D0, D0 - xvpermi.q D0, D2, 0x02 // 0 - xvand.v U4, D4, D4 - xvpermi.q D4, D6, 0x02 // 1 - xvand.v U1, D1, D1 - xvpermi.q D1, D3, 0x02 // 4 - xvand.v U5, D5, D5 - xvpermi.q D5, D7, 0x02 // 5 - xvpermi.q D2, U0, 0x31 // 8 - xvpermi.q D6, U4, 0x31 // 9 - xvpermi.q D3, U1, 0x31 // 12 - xvpermi.q D7, U5, 0x31 // 13 - - xvand.v U8, D8, D8 - xvpermi.q D8, D10, 0x02 // 2 - xvand.v U12, D12, D12 - xvpermi.q D12, D14, 0x02 // 3 - xvand.v U9, D9, D9 - xvpermi.q D9, D11, 0x02 // 6 - xvand.v U13, D13, D13 - xvpermi.q D13, D15, 0x02 // 7 - xvpermi.q D10, U8, 0x31 // 10 - xvpermi.q D14, U12, 0x31 // 11 - xvpermi.q D11, U9, 0x31 // 14 - xvpermi.q D15, U13, 0x31 // 15 - - xvst D0, TD, 0x00 // 0 - xvst D4, TD, 0x20 // 1 - xvst D8, TD, 0x40 // 2 - xvst D12, TD, 0x60 // 3 - xvst D1, TD, 0x80 // 4 - xvst D5, TD, 0xA0 // 5 - xvst D9, TD, 0xC0 // 6 - xvst D13, TD, 0xE0 // 7 - addi.d TD, TD, 0x100 - xvst D2, TD, 0x00 // 8 - xvst D6, TD, 0x20 // 9 - xvst D10, TD, 0x40 // 10 - xvst D14, TD, 0x60 // 11 - xvst D3, TD, 0x80 // 12 - xvst D7, TD, 0xA0 // 13 - xvst D11, TD, 0xC0 // 14 - xvst D15, TD, 0xE0 // 15 - addi.d TD, TD, 0x100 - - - addi.d S1, S1, 0x40 - addi.d S2, S2, 0x40 - addi.d S3, S3, 0x40 - addi.d S4, S4, 0x40 - addi.d S5, S5, 0x40 - addi.d S6, S6, 0x40 - addi.d S7, S7, 0x40 - addi.d S8, S8, 0x40 - addi.d S9, S9, 0x40 - addi.d S10, S10, 0x40 - addi.d S11, S11, 0x40 - addi.d S12, S12, 0x40 - addi.d S13, S13, 0x40 - addi.d S14, S14, 0x40 - addi.d S15, S15, 0x40 - addi.d S16, S16, 0x40 - + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d S9, S8, TL + add.d S10, S9, TL + add.d S11, S10, TL + add.d S12, S11, TL + add.d S13, S12, TL + add.d S14, S13, TL + add.d S15, S14, TL + add.d S16, S15, TL + add.d TS, S16, TL + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, 
S8, 0x00 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + fld.d F0, S9, 0x00 + fld.d F1, S10, 0x00 + fld.d F2, S11, 0x00 + fld.d F3, S12, 0x00 + fld.d F4, S13, 0x00 + fld.d F5, S14, 0x00 + fld.d F6, S15, 0x00 + fld.d F7, S16, 0x00 + + fst.d F0, TD, 0x40 + fst.d F1, TD, 0x48 + fst.d F2, TD, 0x50 + fst.d F3, TD, 0x58 + fst.d F4, TD, 0x60 + fst.d F5, TD, 0x68 + fst.d F6, TD, 0x70 + fst.d F7, TD, 0x78 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d S9, S9, 0x08 + addi.d S10, S10, 0x08 + addi.d S11, S11, 0x08 + addi.d S12, S12, 0x08 + addi.d S13, S13, 0x08 + addi.d S14, S14, 0x08 + addi.d S15, S15, 0x08 + addi.d S16, S16, 0x08 + addi.d TD, TD, 0x80 addi.d I, I, -1 blt ZERO, I, .L_I1 -.L_I7: - andi I, M, 0x07 - beq I, ZERO, .L_I0 - -.L_II1: /* I-- */ - fld.d F0, S1, 0x00 - fld.d F1, S2, 0x00 - fld.d F2, S3, 0x00 - fld.d F3, S4, 0x00 - fld.d F4, S5, 0x00 - fld.d F5, S6, 0x00 - fld.d F6, S7, 0x00 - fld.d F7, S8, 0x00 - - fst.d F0, TD, 0x00 - addi.d S1, S1, 0x08 - fst.d F1, TD, 0x08 - addi.d S2, S2, 0x08 - fst.d F2, TD, 0x10 - addi.d S3, S3, 0x08 - fst.d F3, TD, 0x18 - addi.d S4, S4, 0x08 - fst.d F4, TD, 0x20 - addi.d S5, S5, 0x08 - fst.d F5, TD, 0x28 - addi.d S6, S6, 0x08 - fst.d F6, TD, 0x30 - addi.d S7, S7, 0x08 - fst.d F7, TD, 0x38 - addi.d S8, S8, 0x08 - addi.d TD, TD, 0x40 - - fld.d F0, S9, 0x00 - fld.d F1, S10, 0x00 - fld.d F2, S11, 0x00 - fld.d F3, S12, 0x00 - fld.d F4, S13, 0x00 - fld.d F5, S14, 0x00 - fld.d F6, S15, 0x00 - fld.d F7, S16, 0x00 - - fst.d F0, TD, 0x00 - addi.d S9, S9, 0x08 - fst.d F1, TD, 0x08 - addi.d S10, S10, 0x08 - fst.d F2, TD, 0x10 - addi.d S11, S11, 0x08 - fst.d F3, TD, 0x18 - addi.d S12, S12, 0x08 - fst.d F4, TD, 0x20 - addi.d S13, S13, 0x08 - fst.d F5, TD, 0x28 - addi.d S14, S14, 0x08 - fst.d F6, TD, 0x30 - addi.d S15, S15, 0x08 - fst.d F7, TD, 0x38 - addi.d S16, S16, 0x08 - addi.d TD, TD, 0x40 - - addi.d I, I, -1 - blt ZERO, I, .L_II1 - -.L_I0: - blt ZERO, J, .L_J1 - -.L_N8: - andi J, N, 0x08 - beq ZERO, J, .L_N4 +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N8: /* if(n&8)*/ + andi I, N, 0x08 + beq I, ZERO, .L_N4 move S1, TS add.d S2, TS, TL - srai.d I, M, 0x03 + move I, M add.d S3, S2, TL - add.d S4, S2, T0 - add.d S5, S3, T0 - add.d S6, S4, T0 - add.d S7, S5, T0 - add.d S8, S6, T0 - add.d TS, S7, T0 - beq I, ZERO, .L_8I3 - -.L_8I1: /* I-- */ - xvld U0, S1, 0x00 - xvld U1, S2, 0x00 - xvld U2, S3, 0x00 - xvld U3, S4, 0x00 - xvld U4, S5, 0x00 - xvld U5, S6, 0x00 - xvld U6, S7, 0x00 - xvld U7, S8, 0x00 - - xvpackev.d D0, U1, U0 - xvpackod.d D1, U1, U0 - xvpackev.d D2, U3, U2 - xvpackod.d D3, U3, U2 - xvpackev.d D4, U5, U4 - xvpackod.d D5, U5, U4 - xvpackev.d D6, U7, U6 - xvpackod.d D7, U7, U6 - - xvand.v U0, D0, D0 - xvpermi.q D0, D2, 0x02 // 0 - xvand.v U4, D4, D4 - xvpermi.q D4, D6, 0x02 // 1 - xvand.v U1, D1, D1 - xvpermi.q D1, D3, 0x02 // 2 - xvand.v U5, D5, D5 - xvpermi.q D5, D7, 0x02 // 3 - xvpermi.q D2, U0, 0x31 // 4 - xvpermi.q D6, U4, 0x31 // 5 - xvpermi.q D3, U1, 0x31 // 6 - xvpermi.q D7, U5, 0x31 // 7 - - xvst D0, TD, 0x00 - xvst D4, TD, 0x20 - xvst D1, TD, 0x40 - xvst D5, TD, 0x60 - xvst D2, TD, 0x80 - xvst D6, TD, 0xA0 - xvst D3, TD, 0xC0 - xvst D7, TD, 0xE0 - addi.d TD, TD, 0x100 - - xvld U0, S1, 0x20 - xvld U1, S2, 0x20 - xvld U2, S3, 0x20 - xvld U3, S4, 0x20 - xvld U4, S5, 
0x20 - xvld U5, S6, 0x20 - xvld U6, S7, 0x20 - xvld U7, S8, 0x20 - - xvpackev.d D0, U1, U0 - xvpackod.d D1, U1, U0 - xvpackev.d D2, U3, U2 - xvpackod.d D3, U3, U2 - xvpackev.d D4, U5, U4 - xvpackod.d D5, U5, U4 - xvpackev.d D6, U7, U6 - xvpackod.d D7, U7, U6 - - xvand.v U0, D0, D0 - xvpermi.q D0, D2, 0x02 // 0 - xvand.v U4, D4, D4 - xvpermi.q D4, D6, 0x02 // 1 - xvand.v U1, D1, D1 - xvpermi.q D1, D3, 0x02 // 2 - xvand.v U5, D5, D5 - xvpermi.q D5, D7, 0x02 // 3 - xvpermi.q D2, U0, 0x31 // 4 - xvpermi.q D6, U4, 0x31 // 5 - xvpermi.q D3, U1, 0x31 // 6 - xvpermi.q D7, U5, 0x31 // 7 - - xvst D0, TD, 0x00 - xvst D4, TD, 0x20 - xvst D1, TD, 0x40 - xvst D5, TD, 0x60 - xvst D2, TD, 0x80 - xvst D6, TD, 0xA0 - xvst D3, TD, 0xC0 - xvst D7, TD, 0xE0 - addi.d TD, TD, 0x100 - - addi.d S1, S1, 0x40 - addi.d S2, S2, 0x40 - addi.d S3, S3, 0x40 - addi.d S4, S4, 0x40 - addi.d S5, S5, 0x40 - addi.d S6, S6, 0x40 - addi.d S7, S7, 0x40 - addi.d S8, S8, 0x40 - + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, S8, TL + beq I, ZERO, .L_N4 + +.L_N81: /* if(i>0) i--*/ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 addi.d I, I, -1 - blt ZERO, I, .L_8I1 - -.L_8I3: - andi I, M, 0x07 - beq I, ZERO, .L_N4 - -.L_8I11: - fld.d F0, S1, 0x00 - fld.d F1, S2, 0x00 - fld.d F2, S3, 0x00 - fld.d F3, S4, 0x00 - fld.d F4, S5, 0x00 - fld.d F5, S6, 0x00 - fld.d F6, S7, 0x00 - fld.d F7, S8, 0x00 - - fst.d F0, TD, 0x00 - addi.d S1, S1, 0x08 - fst.d F1, TD, 0x08 - addi.d S2, S2, 0x08 - fst.d F2, TD, 0x10 - addi.d S3, S3, 0x08 - fst.d F3, TD, 0x18 - addi.d S4, S4, 0x08 - fst.d F4, TD, 0x20 - addi.d S5, S5, 0x08 - fst.d F5, TD, 0x28 - addi.d S6, S6, 0x08 - fst.d F6, TD, 0x30 - addi.d S7, S7, 0x08 - fst.d F7, TD, 0x38 - addi.d S8, S8, 0x08 - - addi.d TD, TD, 0x40 - addi.d I, I, -1 - blt ZERO, I, .L_8I11 - -.L_N4: - andi J, N, 0x04 - beq ZERO, J, .L_N2 + blt ZERO, I, .L_N81 + +.L_N4: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 move S1, TS add.d S2, TS, TL - srai.d I, M, 0x02 + move I, M add.d S3, S2, TL - add.d S4, S2, T0 - add.d TS, S3, T0 - beq I, ZERO, .L_I3 - -.L_4I1: /* I-- */ - xvld U0, S1, 0x00 - xvld U1, S2, 0x00 - xvld U2, S3, 0x00 - xvld U3, S4, 0x00 - - xvpackev.d D0, U1, U0 - xvpackod.d D1, U1, U0 - xvpackev.d D2, U3, U2 - xvpackod.d D3, U3, U2 - - xvand.v U0, D0, D0 - xvpermi.q D0, D2, 0x02 // 0 - xvand.v U1, D1, D1 - xvpermi.q D1, D3, 0x02 // 1 - xvpermi.q D2, U0, 0x31 // 2 - xvpermi.q D3, U1, 0x31 // 3 - - xvst D0, TD, 0x00 - xvst D1, TD, 0x20 - xvst D2, TD, 0x40 - xvst D3, TD, 0x60 - - addi.d S1, S1, 0x20 - addi.d S2, S2, 0x20 - addi.d S3, S3, 0x20 - addi.d S4, S4, 0x20 - addi.d TD, TD, 0x80 - + add.d S4, S3, TL + add.d TS, S4, TL + beq I, ZERO, .L_N2 + +.L_N41: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d TD, TD, 0x20 addi.d I, I, -1 - blt ZERO, I, 
.L_4I1 - -.L_I3: - andi I, M, 0x03 - beq I, ZERO, .L_N2 - -.L_4II1: - fld.d F0, S1, 0x00 - fld.d F1, S2, 0x00 - fld.d F2, S3, 0x00 - fld.d F3, S4, 0x00 - - fst.d F0, TD, 0x00 - addi.d S1, S1, 0x08 - fst.d F1, TD, 0x08 - addi.d S2, S2, 0x08 - fst.d F2, TD, 0x10 - addi.d S3, S3, 0x08 - fst.d F3, TD, 0x18 - addi.d S4, S4, 0x08 - - addi.d TD, TD, 0x20 - addi.d I, I, -1 - blt ZERO, I, .L_4II1 - -.L_N2: - andi J, N, 0x02 - beq ZERO, J, .L_N1 + blt ZERO, I, .L_N41 + +.L_N2: /* if(n&2)*/ + andi I, N, 0x02 + beq I, ZERO, .L_N1 move S1, TS add.d S2, TS, TL - srai.d I, M, 0x01 + move I, M add.d TS, S2, TL - beq I, ZERO, .L_NI1 - -.L_2I1: /* I-- */ - xvld U0, S1, 0x00 - xvld U1, S2, 0x00 - - xvpackev.d D0, U1, U0 - xvpackod.d D1, U1, U0 - - xvpermi.q D0, D1, 0x02 // 0 + beq I, ZERO, .L_N1 - xvst D0, TD, 0x00 +.L_N21: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 - addi.d S1, S1, 0x10 - addi.d S2, S2, 0x10 - addi.d TD, TD, 0x20 + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 addi.d I, I, -1 - blt ZERO, I, .L_2I1 - -.L_NI1: - andi I, M, 0x01 - beq I, ZERO, .L_N1 - + blt ZERO, I, .L_N21 - fld.d F0, S1, 0x00 - fld.d F1, S2, 0x00 +.L_N1: /* if(n&2)*/ + andi I, N, 0x01 + beq I, ZERO, .L_N0 - fst.d F0, TD, 0x00 - addi.d S1, S1, 0x08 - fst.d F1, TD, 0x08 - addi.d S2, S2, 0x08 - addi.d TD, TD, 0x10 + move S1, TS + move I, M + beq I, ZERO, .L_N0 -.L_N1: - move S1, TS - beq ZERO, M, .L_N0 +.L_N11: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fst.d F0, TD, 0x00 -.L_M1: - fld.d F0, S1, 0x00 - addi.d S1, S1, 0x08 - fst.d F0, TD, 0x00 - addi.d TD, TD, 0x08 - addi.d M, M, -1 - blt ZERO, M, .L_M1 + addi.d S1, S1, 0x08 + addi.d TD, TD, 0x08 + addi.d I, I, -1 + blt ZERO, I, .L_N11 .L_N0: - LDARG $r23, $sp, 0x00 - LDARG $r24, $sp, 0x08 - LDARG $r25, $sp, 0x10 - LDARG $r26, $sp, 0x18 - LDARG $r27, $sp, 0x20 - LDARG $r28, $sp, 0x28 - LDARG $r29, $sp, 0x30 - LDARG $r30, $sp, 0x38 - LDARG $r31, $sp, 0x40 - LD $f23, $sp, 0x48 - LD $f24, $sp, 0x50 - LD $f25, $sp, 0x58 - LD $f26, $sp, 0x60 - LD $f27, $sp, 0x68 - LD $f28, $sp, 0x70 - LD $f29, $sp, 0x78 - LD $f30, $sp, 0x80 - LD $f31, $sp, 0x88 - addi.d $sp, $sp, 0x90 - jirl $r0, $r1, 0x00 + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + LDARG $r30, $sp, 56 + addi.d $sp, $sp, 64 + jirl $r0, $r1, 0x00 EPILOGUE \ No newline at end of file From a978ad318070e24f5ca0cf4221b55abb5869287e Mon Sep 17 00:00:00 2001 From: pengxu Date: Tue, 13 May 2025 16:09:12 +0800 Subject: [PATCH 160/205] Loongarch64: add C functions of zgemm_ncopy_16 --- kernel/generic/zgemm_ncopy_16.c | 332 ++++++++++++++++++++++++++++++++ 1 file changed, 332 insertions(+) create mode 100644 kernel/generic/zgemm_ncopy_16.c diff --git a/kernel/generic/zgemm_ncopy_16.c b/kernel/generic/zgemm_ncopy_16.c new file mode 100644 index 000000000..088103525 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_16.c @@ -0,0 +1,332 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + lda *= 2; + + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset9 = aoffset8 + lda; + aoffset10 = aoffset9 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset += 16 * lda; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + + ctemp17 = *(aoffset9 + 0); + ctemp18 = *(aoffset9 + 1); + ctemp19 = *(aoffset10 + 0); + ctemp20 = *(aoffset10 + 1); + ctemp21 = *(aoffset11 + 0); + ctemp22 = *(aoffset11 + 1); + ctemp23 = *(aoffset12 + 0); + ctemp24 = *(aoffset12 + 1); + ctemp25 = *(aoffset13 + 0); + ctemp26 = *(aoffset13 + 1); + 
ctemp27 = *(aoffset14 + 0); + ctemp28 = *(aoffset14 + 1); + ctemp29 = *(aoffset15 + 0); + ctemp30 = *(aoffset15 + 1); + ctemp31 = *(aoffset16 + 0); + ctemp32 = *(aoffset16 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + *(boffset + 16) = ctemp17; + *(boffset + 17) = ctemp18; + *(boffset + 18) = ctemp19; + *(boffset + 19) = ctemp20; + *(boffset + 20) = ctemp21; + *(boffset + 21) = ctemp22; + *(boffset + 22) = ctemp23; + *(boffset + 23) = ctemp24; + *(boffset + 24) = ctemp25; + *(boffset + 25) = ctemp26; + *(boffset + 26) = ctemp27; + *(boffset + 27) = ctemp28; + *(boffset + 28) = ctemp29; + *(boffset + 29) = ctemp30; + *(boffset + 30) = ctemp31; + *(boffset + 31) = ctemp32; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + aoffset5 += 2; + aoffset6 += 2; + aoffset7 += 2; + aoffset8 += 2; + aoffset9 += 2; + aoffset10 += 2; + aoffset11 += 2; + aoffset12 += 2; + aoffset13 += 2; + aoffset14 += 2; + aoffset15 += 2; + aoffset16 += 2; + + boffset += 32; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + aoffset5 += 2; + aoffset6 += 2; + aoffset7 += 2; + aoffset8 += 2; + + boffset += 16; + i --; + }while(i > 0); + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 2; + aoffset2 += 
2; + aoffset3 += 2; + aoffset4 += 2; + + boffset += 8; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + if (n & 1){ + aoffset1 = aoffset; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2; + boffset += 2; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + return 0; +} From 9a7e3f102b393f263559ef2852d92e7138ff5482 Mon Sep 17 00:00:00 2001 From: guoyuanplct Date: Wed, 14 May 2025 00:09:26 +0800 Subject: [PATCH 161/205] kernel/riscv64:Fixed the bug of openblas_utest_ext failing in c/zgemv and some c/zgbmv tests: --- kernel/riscv64/zgemv_n_vector.c | 83 +++++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 20 deletions(-) diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index cbed06c97..8d44dd25a 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -66,7 +66,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG lda2 = lda * 2; vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl); vy1_new = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl); - for (k = 0, j = 0; k < m / gvl; k++) + for (k = 0, j = 0; k < m / gvl; k ++) { a_ptr = a; ix = 0; @@ -121,30 +121,73 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, #endif a_ptr += lda2; ix += inc_x2; + } - for (; i < n; i += 4) + for (i = n % 4 ; i < n; i += 4) { #if !defined(XCONJ) - - x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4); - x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4); - temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4); - temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4); - temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 4); - temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 4); - VSEV_FLOAT(&temp_rr[0], temp_rv, 4); - VSEV_FLOAT(&temp_ii[0], temp_iv, 4); + // temp_rr[0] = alpha_r * x[ix] - alpha_i * x[ix + 1]; + // temp_rr[1] = alpha_r * x[ix + inc_x2] - alpha_i * x[ix + inc_x2 + 1]; + x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 2); + x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 2); + temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2); + temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 2); + + // temp_ii[0] = alpha_r * x[ix + 1] + alpha_i * x[ix]; + // temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] + alpha_i * x[ix + inc_x2]; + temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 2); + temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 2); + VSEV_FLOAT(&temp_rr[0], temp_rv, 2); + VSEV_FLOAT(&temp_ii[0], temp_iv, 2); + + // temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] - alpha_i * x[ix + inc_x2 * 2 + 1]; + // temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] - alpha_i * x[ix + inc_x2 * 3 + 1]; + x_v0 = VLSEV_FLOAT(&x[ix + inc_x2 * 2], inc_x2 * sizeof(FLOAT), 2); + x_v1 = VLSEV_FLOAT(&x[ix + inc_x2 * 2 + 1], inc_x2 * sizeof(FLOAT), 2); + temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2); + temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 2); + + // temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] + alpha_i * x[ix + inc_x2 * 2]; + // temp_ii[3] = 
alpha_r * x[ix + inc_x2 * 3 + 1] + alpha_i * x[ix + inc_x2 * 3]; + temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 2); + temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 2); + VSEV_FLOAT(&temp_rr[2], temp_rv, 2); + VSEV_FLOAT(&temp_ii[2], temp_iv, 2); #else - x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4); - x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4); - temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4); - temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4); - temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 4); - temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_r, x_v1, 4); - VSEV_FLOAT(&temp_rr[0], temp_rv, 4); - VSEV_FLOAT(&temp_ii[0], temp_iv, 4); + // temp_rr[0] = alpha_r * x[ix] + alpha_i * x[ix + 1]; + // temp_rr[1] = alpha_r * x[ix + inc_x2] + alpha_i * x[ix + inc_x2 + 1]; + x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 2); + x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 2); + temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2); + temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 2); + + + // temp_ii[0] = alpha_r * x[ix + 1] - alpha_i * x[ix]; + // temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] - alpha_i * x[ix + inc_x2]; + temp_iv = VFMUL_VF_FLOAT(x_v1, alpha_r, 2); + temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_i, x_v0, 2); + VSEV_FLOAT(&temp_rr[0], temp_rv, 2); + VSEV_FLOAT(&temp_ii[0], temp_iv, 2); + + + // temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] + alpha_i * x[ix + inc_x2 * 2 + 1]; + // temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] + alpha_i * x[ix + inc_x2 * 3 + 1]; + x_v0 = VLSEV_FLOAT(&x[ix + inc_x2 * 2], inc_x2 * sizeof(FLOAT), 2); + x_v1 = VLSEV_FLOAT(&x[ix + inc_x2 * 2 + 1], inc_x2 * sizeof(FLOAT), 2); + temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2); + temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 2); + + + temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] - alpha_i * x[ix + inc_x2 * 2]; + temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] - alpha_i * x[ix + inc_x2 * 3]; + temp_iv = VFMUL_VF_FLOAT(x_v1, alpha_r, 2); + temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_i, x_v0, 2); + VSEV_FLOAT(&temp_rr[2], temp_rv, 2); + VSEV_FLOAT(&temp_ii[2], temp_iv, 2); + + #endif @@ -257,7 +300,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl); j += gvl * 2; - iy += inc_yv; + iy += inc_yv ; } // tail if (j / 2 < m) From 4d213653d857d6365221b79c16d4e151120e9fbe Mon Sep 17 00:00:00 2001 From: guoyuanplct Date: Thu, 15 May 2025 13:29:14 +0800 Subject: [PATCH 162/205] kernel/riscv64:Added support for omatcopy on riscv64. 
--- kernel/riscv64/KERNEL.RISCV64_ZVL256B | 6 ++ kernel/riscv64/omatcopy_cn_vector.c | 125 ++++++++++++++++++++++++++ kernel/riscv64/zomatcopy_cn_vector.c | 111 +++++++++++++++++++++++ 3 files changed, 242 insertions(+) create mode 100644 kernel/riscv64/omatcopy_cn_vector.c create mode 100644 kernel/riscv64/zomatcopy_cn_vector.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index 9915fd949..ba7a52bbf 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -201,3 +201,9 @@ endif ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif + +ZOMATCOPY_CN = zomatcopy_cn_vector.c +COMATCOPY_CN = zomatcopy_cn_vector.c + +DOMATCOPY_CN = omatcopy_cn_vector.c +SOMATCOPY_CN = omatcopy_cn_vector.c diff --git a/kernel/riscv64/omatcopy_cn_vector.c b/kernel/riscv64/omatcopy_cn_vector.c new file mode 100644 index 000000000..444c8232d --- /dev/null +++ b/kernel/riscv64/omatcopy_cn_vector.c @@ -0,0 +1,125 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + + +#if !defined(DOUBLE) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m4)() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#else +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m4)() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#endif + + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i,j; + FLOAT *aptr,*bptr; + size_t vl; + + FLOAT_V_T va, vb,va1,vb1; + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + + aptr = a; + bptr = b; + + if ( alpha == 0.0 ) + { + vl = VSETVL_MAX; + va = VFMVVF_FLOAT(0, vl); + for ( i=0; i Date: Thu, 15 May 2025 18:55:47 +0800 Subject: [PATCH 163/205] Format Code --- kernel/riscv64/omatcopy_cn_vector.c | 2 -- kernel/riscv64/zomatcopy_cn_vector.c | 5 ----- 2 files changed, 7 deletions(-) diff --git a/kernel/riscv64/omatcopy_cn_vector.c b/kernel/riscv64/omatcopy_cn_vector.c index 444c8232d..d079310b8 100644 --- a/kernel/riscv64/omatcopy_cn_vector.c +++ b/kernel/riscv64/omatcopy_cn_vector.c @@ -26,8 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" - - #if !defined(DOUBLE) #define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m4)() #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) diff --git a/kernel/riscv64/zomatcopy_cn_vector.c b/kernel/riscv64/zomatcopy_cn_vector.c index bbfbd214a..b141ed4a6 100644 --- a/kernel/riscv64/zomatcopy_cn_vector.c +++ b/kernel/riscv64/zomatcopy_cn_vector.c @@ -70,7 +70,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, FLOAT_VX2_T va, vb; unsigned int gvl = 0; - if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); @@ -85,8 +84,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, for(j=0; j Date: Fri, 16 May 2025 18:24:46 +0800 Subject: [PATCH 164/205] Add retry mechanism after deadlock timeout for c910v. --- .github/workflows/c910v.yml | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml index c5b497316..9981c437b 100644 --- a/.github/workflows/c910v.yml +++ b/.github/workflows/c910v.yml @@ -83,9 +83,39 @@ jobs: - name: test run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH - qemu-riscv64 ./utest/openblas_utest - qemu-riscv64 ./utest/openblas_utest_ext + run_with_retry() { + local cmd="$1" + local time_out=10 + local retries=10 + local attempt=0 + + for ((i=1; i<=retries; i++)); do + attempt=$((i)) + if timeout -s 12 --preserve-status $time_out $cmd; then + echo "Command succeeded on attempt $i." + return 0 + else + local exit_code=$? + if [ $exit_code -eq 140 ]; then + echo "Attempt $i timed out (retrying...)" + time_out=$((time_out + 5)) + else + echo "Attempt $i failed with exit code $exit_code. Aborting workflow." + exit $exit_code + fi + fi + done + echo "All $retries attempts failed, giving up." 
+ echo "Final failure was due to timeout." + echo "Aborting workflow." + exit $exit_code + } + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + which qemu-riscv64 + export QEMU_BIN=$(which qemu-riscv64) + run_with_retry "$QEMU_BIN ./utest/openblas_utest" + run_with_retry "$QEMU_BIN ./utest/openblas_utest_ext" + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1 OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1 OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1 From 6680e0592f9f2e4e0551b13d7c1f6fc3e225fe95 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 17 May 2025 05:12:15 -0700 Subject: [PATCH 165/205] Fix conditional inclusion of SGEMM_KERNEL_DIRECT --- interface/gemm.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index d36925629..54e5604fd 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -417,21 +417,24 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; -#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) -#if defined(DYNAMIC_ARCH) && defined(ARCH_x86) - if (support_avx512() ) +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) +#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) +#if defined(DYNAMIC_ARCH) + if (support_avx512() ) +#endif if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); return; } #endif -#if defined(DYNAMIC_ARCH) && defined(ARCH_ARM64) - if (support_sme1()){ +#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) +#if defined(DYNAMIC_ARCH) + if (support_sme1()) +#endif if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) { SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); return; } - } #endif #endif From f2022c23aca676dcfc43a539bde02dc14411966b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 May 2025 16:08:12 +0200 Subject: [PATCH 166/205] Remove sve capability from NeoverseN1 and specify CortexX2/A?10 as arm8.4a --- cmake/cc.cmake | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index f292f1c57..66b316f7f 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -229,9 +229,9 @@ if (${CORE} STREQUAL NEOVERSEN1) if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-n1") elseif (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a -mtune=neoverse-n1") else () - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a") endif() endif () endif () @@ -260,13 +260,13 @@ endif () if (${CORE} STREQUAL CORTEXA510) if (NOT DYNAMIC_ARCH) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") endif () endif () if (${CORE} STREQUAL CORTEXA710) if (NOT DYNAMIC_ARCH) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") endif () endif () @@ -278,7 +278,7 @@ endif () if (${CORE} STREQUAL CORTEXX2) if (NOT DYNAMIC_ARCH) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + set (CCOMMON_OPT 
"${CCOMMON_OPT} -march=armv8.4-a+sve") endif () endif () From 8779eac3b8afb1b862b85ef08ceec3305d054e09 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 May 2025 08:55:14 -0700 Subject: [PATCH 167/205] Do not add a 64 suffix to the library name if the user-provided suffix already contains it --- CMakeLists.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f94c4c474..7094eb5b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -123,7 +123,12 @@ message(WARNING "CMake support is experimental. It does not yet support all buil include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") -set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) +string(FIND "${LIBNAMESUFFIX}" "${SUFFIX64_UNDERSCORE}" HAVE64) +if (${HAVE64} GREATER -1) + set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}) +else () + set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) +endif () set(BLASDIRS interface driver/level2 driver/level3 driver/others) @@ -716,4 +721,5 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake DESTINATION ${CMAKECONFIG_INSTALL_DIR}) install(EXPORT "${PN}${SUFFIX64}Targets" NAMESPACE "${PN}${SUFFIX64}::" - DESTINATION ${CMAKECONFIG_INSTALL_DIR}) \ No newline at end of file + DESTINATION ${CMAKECONFIG_INSTALL_DIR}) + From 4ca76d9de4ce8808498aa314e7dad961eef16d5b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 May 2025 12:07:24 -0700 Subject: [PATCH 168/205] Expressly provide a shared libs option --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7094eb5b7..f13f707f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,6 +62,7 @@ option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm th option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) option(BUILD_STATIC_LIBS "Build static library" OFF) +option(BUILD_SHARED_LIBS "Build shared library" OFF) if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) endif() From 2351a98005c68aca88e9403ff19f83fe90c6bd49 Mon Sep 17 00:00:00 2001 From: Masato Nakagawa Date: Wed, 21 May 2025 21:21:52 +0900 Subject: [PATCH 169/205] Update 2D thread-partitioned GEMM for M << N case. --- driver/level3/level3_thread.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 77aaeee6b..05d349d97 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -851,9 +851,19 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF /* Objective function come from sum of partitions in m and n. 
*/ /* (n / nthreads_n) + (m / nthreads_m) */ /* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m) */ - while (nthreads_m % 2 == 0 && n * nthreads_m + m * nthreads_n > n * (nthreads_m / 2) + m * (nthreads_n * 2)) { - nthreads_m /= 2; - nthreads_n *= 2; + BLASLONG cost = 0, div = 0; + for (BLASLONG i = 1; i <= sqrt(nthreads_m); i++) { + if (nthreads_m % i) continue; + BLASLONG j = nthreads_m / i; + BLASLONG cost_i = n * j + m * nthreads_n * i; + BLASLONG cost_j = n * i + m * nthreads_n * j; + if (cost == 0 || + cost_i < cost) {cost = cost_i; div = i;} + if (cost_j < cost) {cost = cost_j; div = j;} + } + if (div > 1) { + nthreads_m /= div; + nthreads_n *= div; } } From bd573a9d387abb3bd81a88660f2b064302ab3a93 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 21 May 2025 22:01:02 +0200 Subject: [PATCH 170/205] Expand mingw32 gfortran workaround to all versions after 14.1 --- ctest/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 03b157843..83a715005 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -6,7 +6,7 @@ enable_language(Fortran) endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") -if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_EQUAL 14.2) +if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER 14.1) list(REMOVE_ITEM ${CMAKE_Fortran_FLAGS} -O3 -O2 -O1 -Os) set (CMAKE_Fortran_FLAGS_RELEASE "" CACHE STRING "" FORCE) endif() From 42b7d1f8972f6444395ce71125da2eed1a0ae196 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 21 May 2025 22:03:38 +0200 Subject: [PATCH 171/205] Fix addressing of alpha in CBLAS --- interface/zsyr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/zsyr.c b/interface/zsyr.c index 8bc9ac177..51cca84ee 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -116,12 +116,12 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, #else -void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, void* valpha, FLOAT *x, int incx, FLOAT *a, int lda) { FLOAT *buffer; int uplo; blasint info; - FLOAT * ALPHA = α + FLOAT * ALPHA = (FLOAT*)valpha; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; #ifdef SMP From 20f2ba014143f195ff5cafa9b3bb98d5c89a3a03 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 21 May 2025 23:44:17 +0200 Subject: [PATCH 172/205] Move declaration of i for pre-C99 compilers --- driver/level3/level3_thread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 05d349d97..db3bffc10 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -852,7 +852,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF /* (n / nthreads_n) + (m / nthreads_m) */ /* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m) */ BLASLONG cost = 0, div = 0; - for (BLASLONG i = 1; i <= sqrt(nthreads_m); i++) { + BLASLONG i; + for (i = 1; i <= sqrt(nthreads_m); i++) { if (nthreads_m % i) continue; BLASLONG j = nthreads_m / i; BLASLONG cost_i = n * j + m * nthreads_n * i; From 669c847ceb87faf5242d88cdf13d687ae6573038 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 23 May 2025 05:52:48 -0700 Subject: [PATCH 173/205] support extra flag for NaN 
handling --- kernel/x86_64/cscal.c | 74 ++++++++++++++++++++++++++++++------------- kernel/x86_64/zscal.c | 68 ++++++++++++++++++++++++++++----------- 2 files changed, 101 insertions(+), 41 deletions(-) diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 212a21594..be32bf35a 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -229,10 +229,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ( da_i == 0.0 ) { - + if (!dummy2) { while(j < n1) { - x[i]=0.0; x[i+1]=0.0; x[i+inc_x]=0.0; @@ -244,21 +243,48 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while(j < n) { - x[i]=0.0; x[i+1]=0.0; i += inc_x ; j++; - } + } else { + float temp; + while(j < n1) + { + if (isnan(x[i])|| isnan(x[i+1])) + temp=NAN; + else + temp=0.0; + x[i]=temp; + x[i+1]=temp; + if (isnan(x[i+inc_x])|| isnan(x[i+inc_x+1])) + temp=NAN; + else + temp=0.0; + x[i+inc_x]= temp; + x[i+inc_x+1]= temp; + i += 2*inc_x; + j+=2; + } + while(j < n) + { + if (isnan(x[i])|| isnan(x[i+1])) + temp=NAN; + else + temp=0.0; + x[i]=temp; + x[i+1]=temp; + i += inc_x; + j++; + } + } } else { - while(j < n1) { - if (isnan(x[i]) || isinf(x[i])) temp0 = NAN; else @@ -278,7 +304,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, x[i+inc_x] = temp1; i += 2*inc_x ; j+=2; - } while(j < n) @@ -305,14 +330,12 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, else { - - if ( da_i == 0.0 ) + if ( da_i == 0.0 && dummy2 ) { BLASLONG n1 = n & -2; while(j < n1) { - temp0 = da_r * x[i]; x[i+1] = da_r * x[i+1]; x[i] = temp0; @@ -367,22 +390,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, return(0); } - BLASLONG n1 = n & -16; if ( n1 > 0 ) { alpha[0] = da_r; alpha[1] = da_i; - if ( da_r == 0.0 ) - if ( da_i == 0 ) + if ( da_i == 0 && !dummy2) cscal_kernel_16_zero(n1 , alpha , x); else - cscal_kernel_16_zero_r(n1 , alpha , x); + cscal_kernel_16/*_zero_r*/(n1 , alpha , x); else cscal_kernel_16(n1 , alpha , x); - i = n1 << 1; j = n1; } @@ -393,6 +413,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { FLOAT res=0.0; if (isnan(da_r)) res= da_r; + if (dummy2) + if (isnan(x[i])||isnan(x[i+1])) res= NAN; while(j < n) { x[i]=res; @@ -415,7 +437,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { - while(j < n) { temp0 = -da_i * x[i+1]; @@ -424,11 +445,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if (!isinf(x[i+1])) x[i+1] = da_i * x[i]; else x[i+1] = NAN; - if ( x[i] == x[i]) //preserve NaN + if ( !isnan(x[i])) //preserve NaN x[i] = temp0; i += 2 ; j++; - } } @@ -439,12 +459,22 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ( da_i == 0.0 ) { - while(j < n) { - + temp0 = da_r * x[i]; - x[i+1] = da_r * x[i+1]; + if (dummy2) { + if (isnan(x[i])||isinf(x[i])) temp0=NAN; + if (isnan(x[i+1])||isinf(x[i+1])) + x[i+1]=NAN; + else + x[i+1] = da_r * x[i+1]; + } else { + if (isnan(x[i])) + x[i+1] = NAN; + else + x[i+1] = da_r * x[i+1]; + } x[i] = temp0; i += 2 ; j++; @@ -476,7 +506,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, temp0 = da_r * x[i] - da_i * x[i+1]; x[i+1] = da_r * x[i+1] + da_i * x[i]; - x[i] = temp0; + if(!isnan(x[i]))x[i] = temp0; i += 2 ; j++; diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 7859ef6e3..b3d146fd0 100644 --- a/kernel/x86_64/zscal.c +++ 
b/kernel/x86_64/zscal.c @@ -222,13 +222,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ( da_r == 0.0 ) { + BLASLONG n1 = n & -2; if ( da_i == 0.0 ) { + if (!dummy2) { while(j < n1) { - x[i]=0.0; x[i+1]=0.0; x[i+inc_x]=0.0; @@ -245,9 +246,40 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, x[i+1]=0.0; i += inc_x ; j++; + } + } else { + float temp; + while(j < n1) + { + if (isnan(x[i])|| isnan(x[i+1])) + temp=NAN; + else + temp=0.0; + x[i]=temp; + x[i+1]=temp; + if (isnan(x[i+inc_x])|| isnan(x[i+inc_x+1])) + temp=NAN; + else + temp=0.0; + x[i+inc_x]= temp; + x[i+inc_x+1]= temp; + i += 2*inc_x; + j+=2; } + while(j < n) + { + if (isnan(x[i])|| isnan(x[i+1])) + temp=NAN; + else + temp=0.0; + x[i]=temp; + x[i+1]=temp; + i += inc_x; + j++; + } + } } else { @@ -260,7 +292,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, temp0 = -da_i * x[i+1]; if (!isinf(x[i+1])) x[i+1] = da_i * x[i]; - else x[i+1] = NAN; + else x[i+1] = NAN; x[i] = temp0; if (isnan(x[i+inc_x]) || isinf(x[i+inc_x])) temp1 = NAN; @@ -291,16 +323,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } - - } } else { - - if ( da_i == 0.0 ) + if ( da_i == 0.0 && dummy2) { BLASLONG n1 = n & -2; @@ -370,26 +399,27 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, alpha[1] = da_i; if ( da_r == 0.0 ) - if ( da_i == 0 ) + if ( da_i == 0 && !dummy2 ) zscal_kernel_8_zero(n1 , alpha , x); else -// zscal_kernel_8_zero_r(n1 , alpha , x); zscal_kernel_8(n1 , alpha , x); else - if ( da_i == 0 && da_r == da_r) + /* if ( da_i == 0 && da_r == da_r ) zscal_kernel_8_zero_i(n1 , alpha , x); - else + else*/ zscal_kernel_8(n1 , alpha , x); - } + i = n1 << 1; j = n1; - - if ( da_r == 0.0 || da_r != da_r ) + } + if ( da_r == 0.0 || isnan(da_r) ) { if ( da_i == 0.0 ) { - FLOAT res=0.0; - if (da_r != da_r) res= da_r; + FLOAT res=0.0; + if (isnan(da_r)) res= da_r; + if (dummy2) + if (isnan(x[i])||isnan(x[i+1])) res= NAN; while(j < n) { x[i]=res; @@ -412,7 +442,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { - while(j < n) { temp0 = -da_i * x[i+1]; @@ -421,7 +450,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if (!isinf(x[i+1])) x[i+1] = da_i * x[i]; else x[i+1] = NAN; - if ( x[i] == x[i]) //preserve NaN + if ( !isnan(x[i])) //preserve NaN x[i] = temp0; i += 2 ; j++; @@ -437,8 +466,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { while(j < n) { - temp0 = da_r * x[i]; + if (isnan(x[i]))x[i+1]=NAN; + else x[i+1] = da_r * x[i+1]; x[i] = temp0; i += 2 ; @@ -453,7 +483,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { temp0 = da_r * x[i] - da_i * x[i+1]; x[i+1] = da_r * x[i+1] + da_i * x[i]; - x[i] = temp0; + if(!isnan(x[i]))x[i] = temp0; i += 2 ; j++; From 28f8fdaf0f87c9bca2a79dd41536bd7ff2027e0c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 23 May 2025 14:59:59 +0200 Subject: [PATCH 174/205] support flag for NaN/Inf handling and fix scaling of NaN/Inf values --- kernel/arm64/zscal.S | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S index 4bd43320d..93e51b70c 100644 --- a/kernel/arm64/zscal.S +++ b/kernel/arm64/zscal.S @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define INC_X x4 /* X stride */ #define I x5 /* loop variable */ #define X_COPY x6 /* Copy of X */ - +#define FLAG x7 /* NaN handling level */ /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -217,11 +217,15 @@ zscal_begin: cmp N, xzr ble .Lzscal_kernel_L999 + ldr FLAG, [sp] + cmp FLAG, #1 + beq .Lzscal_kernel_R_non_zero + fcmp DA_R, #0.0 bne .Lzscal_kernel_R_non_zero - fcmp DA_I, #0.0 - beq .Lzscal_kernel_RI_zero +// fcmp DA_I, #0.0 +// beq .Lzscal_kernel_RI_zero // b .Lzscal_kernel_R_zero From cf06250d36b21f6d0962fc2c84fdd426b93085bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 24 May 2025 06:06:24 -0700 Subject: [PATCH 175/205] add handling of dummy2 flag --- kernel/power/zscal.S | 6 ++++++ kernel/power/zscal.c | 2 +- kernel/power/zscal_ppc440.S | 5 +++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/power/zscal.S b/kernel/power/zscal.S index ae68ee672..5b2861651 100644 --- a/kernel/power/zscal.S +++ b/kernel/power/zscal.S @@ -51,6 +51,7 @@ #define X r8 #define INCX r9 #endif +#define FLAG r11 #endif #if defined(_AIX) || defined(__APPLE__) @@ -61,6 +62,7 @@ #define X r8 #define INCX r9 #endif +#define FLAG r11 #endif #define FZERO f0 @@ -94,6 +96,10 @@ fcmpu cr0, FZERO, ALPHA_I bne- cr0, LL(A1I1) + LDLONG FLAG, 104(SP) + cmpwi cr0, FLAG, 1 + beq- cr0, LL(A1I1) + cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(A0IN) diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 6b7392d0c..671dc9612 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -136,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F if ( inc_x <= 0 ) return(0); - if (da_r == ZERO && da_i == ZERO) { + if (da_r == ZERO && da_i == ZERO && dummy2 == 0) { //clear the vector and return if (inc_x == 1) { memset(x, 0, n*COMPSIZE*SIZE); diff --git a/kernel/power/zscal_ppc440.S b/kernel/power/zscal_ppc440.S index 55dd1b87b..c75bb4ae2 100644 --- a/kernel/power/zscal_ppc440.S +++ b/kernel/power/zscal_ppc440.S @@ -64,6 +64,7 @@ #endif #define INC1 r11 +#define FLAG r12 #define FZERO f0 #define ALPHA_R f1 @@ -97,6 +98,10 @@ fcmpu cr0, FZERO, ALPHA_I bne- cr0, LL(A1I1) + lwz FLAG, FRAMESLOT(0)(SP) + cmpwi cr0, FLAG, 1 + beq- cr0, LL(A1I1) + LL(A0IN): srawi. 
r0, N, 3 mtspr CTR, r0 From fb8dc8ff5c0382d048017e9e6174197b044b17bc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 25 May 2025 14:47:06 -0700 Subject: [PATCH 176/205] Add dummy2 flag handling --- kernel/zarch/cscal.c | 70 +++++++++++++++++++++++++++++++++++++------- kernel/zarch/zscal.c | 63 ++++++++++++++++++++++++++++++++------- 2 files changed, 111 insertions(+), 22 deletions(-) diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c index e623f306b..1c9f2cda7 100644 --- a/kernel/zarch/cscal.c +++ b/kernel/zarch/cscal.c @@ -210,7 +210,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -2; if (da_i == 0.0) { - + if (dummy2 == 0) { while (j < n1) { x[i] = 0.0; @@ -230,11 +230,43 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, j++; } + } else { + while (j < n1) { + if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) { + x[i] = NAN; + x[i+1] = NAN; + }else{ + x[i] = 0.0; + x[i + 1] = 0.0; + } + if (isnan(x[i+inc_x]) || isinf(x[i+inc_x]) || isnan(x[i+1+inc_x])) { + x[i + inc_x] = NAN; + x[i + 1 + inc_x] = NAN; + } else { + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + } + i += 2 * inc_x; + j += 2; + + } + while (j < n) { + if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) { + x[i] = NAN; + x[i+1] = NAN; + }else{ + x[i] = 0.0; + x[i + 1] = 0.0; + } + i += inc_x; + j++; + } + } } else { while (j < n1) { - if (isnan(x[i]) || isinf(x[i])) + if (isnan(x[i]) || isinf(x[i])) temp0 = NAN; else temp0 = -da_i * x[i + 1]; @@ -276,7 +308,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { - if (da_i == 0.0) { + if (da_i == 0.0 && dummy2) { BLASLONG n1 = n & -2; while (j < n1) { @@ -335,12 +367,16 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, alpha[1] = da_i; if (da_r == 0.0) - if (da_i == 0) + if (da_i == 0 && dummy2 == 0) cscal_kernel_16_zero(n1, x); - else + else { +/* if (dummy2 == 0) cscal_kernel_16_zero_r(n1, alpha, x); - else if (da_i == 0) - cscal_kernel_16_zero_i(n1, alpha, x); + else*/ + cscal_kernel_16(n1, da_r, da_i, x); + } +/* else if (da_i == 0 && !isnan(da_r)) + cscal_kernel_16/*_zero_i(n1, alpha, x);*/ else cscal_kernel_16(n1, da_r, da_i, x); @@ -354,7 +390,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, float res = 0.0; if (isnan(da_r)) res = da_r; while (j < n) { - + if (dummy2) + if (isnan(x[i])|| isnan(x[i+1])) res=NAN; x[i] = res; x[i + 1] = res; i += 2; @@ -382,7 +419,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, x[i + 1] = da_i * x[i]; else x[i + 1] = NAN; - if (x[i] == x[i]) + if (!isnan(x[i])) x[i] = temp0; i += 2; j++; @@ -398,7 +435,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n) { temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; + if (dummy2) { + if (isnan(x[i])||isinf(x[i]))temp0 = NAN; + if (isnan(x[i+1])||isinf(x[i+1])) + x[i+1] = NAN; + else + x[i+1] = da_r * x[i + 1]; + } else { + if (isnan(x[i])) + x[i + 1] = NAN; + else + x[i + 1] = da_r * x[i + 1]; + } x[i] = temp0; i += 2; j++; @@ -411,7 +459,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, temp0 = da_r * x[i] - da_i * x[i + 1]; x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; + if (!isnan(x[i])) x[i] = temp0; i += 2; j++; diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 36466a6e0..5111bc455 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ 
-208,7 +208,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -2; if (da_i == 0.0) { - + if (dummy2 == 0) { while (j < n1) { x[i] = 0.0; @@ -228,7 +228,38 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, j++; } - + } else { + while (j < n1) { + if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) { + x[i] = NAN; + x[i+1] = NAN; + } else { + x[i] = 0.0; + x[i+1] = 0.0; + } + if (isnan(x[i+inc_x]) || isinf(x[i+inc_x]) || isnan(x[i+inc_x+1])) { + x[i + inc_x] = NAN; + x[i + inc_x + 1] = NAN; + } else { + x[i + inc_x] = 0.; + x[i + inc_x + 1] = 0.; + } + i += 2 * inc_x; + j += 2; + } + + while (j < n) { + if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) { + x[i] = NAN; + x[i+1] = NAN; + } else { + x[i] = 0.; + x[i+1] = 0.; + } + i += inc_x; + j++; + } + } } else { while (j < n1) { @@ -276,7 +307,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { - if (da_i == 0.0) { + if (da_i == 0.0 && dummy2) { BLASLONG n1 = n & -2; while (j < n1) { @@ -335,12 +366,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, alpha[1] = da_i; if (da_r == 0.0) - if (da_i == 0) + if (da_i == 0 && dummy2 == 0) zscal_kernel_8_zero(n1, x); else zscal_kernel_8(n1, da_r, da_i, x); - else if (da_i == 0 && da_r == da_r) - zscal_kernel_8_zero_i(n1, alpha, x); else zscal_kernel_8(n1, da_r, da_i, x); @@ -354,7 +383,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, double res= 0.0; if (isnan(da_r)) res = da_r; while (j < n) { - + if (dummy2) + if (isnan(x[i]) || isnan(x[i+1])) res = NAN; x[i] = res; x[i + 1] = res; i += 2; @@ -381,7 +411,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, x[i + 1] = da_i * x[i]; else x[i + 1] = NAN; - if (x[i]==x[i]) + if (!isnan(x[i])) x[i] = temp0; i += 2; j++; @@ -397,8 +427,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n) { temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; + if (dummy2) { + if (isnan(x[i]) || isinf(x[i])) temp0 = NAN; + if (isnan(x[i + 1]) || isinf(x[i + 1])) + x[i + 1] = NAN; + else + x[i + 1] = da_r * x[i + 1]; + } else { + if (isnan(x[i])) + x[i + 1] = NAN; + else + x[i + 1] = da_r * x[i + 1]; + } + x[i] = temp0; i += 2; j++; @@ -410,7 +451,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, temp0 = da_r * x[i] - da_i * x[i + 1]; x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; + if (!isnan(x[i])) x[i] = temp0; i += 2; j++; From 45fd2d9b0790c5ca3698502d65d59d38d911ef4f Mon Sep 17 00:00:00 2001 From: guoyuanplct Date: Thu, 29 May 2025 17:50:44 +0800 Subject: [PATCH 177/205] Optimized the axpby function. 
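AXPBY is the extended-BLAS update y := alpha*x + beta*y over (possibly strided) vectors. As a reference for reading the vectorized RVV kernel added in this patch, here is a minimal scalar sketch of that update; the name axpby_ref and the plain double element type are assumptions of the example, not the OpenBLAS kernel interface:

#include <stddef.h>

/* Scalar reference for AXPBY: y[i*incy] = alpha * x[i*incx] + beta * y[i*incy]. */
static void axpby_ref(size_t n, double alpha, const double *x, size_t incx,
                      double beta, double *y, size_t incy)
{
    for (size_t i = 0; i < n; i++)
        y[i * incy] = alpha * x[i * incx] + beta * y[i * incy];
}

Note that the kernel below additionally special-cases inc_y == 0, repeatedly applying the update to the single element y[0], which this sketch does not attempt to model.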
--- kernel/riscv64/KERNEL.RISCV64_ZVL256B | 8 ++ kernel/riscv64/axpby_vector_v2.c | 149 ++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 kernel/riscv64/axpby_vector_v2.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index ba7a52bbf..0fd6adb8b 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -169,11 +169,15 @@ SSYMV_U_KERNEL = symv_U_vector.c SSYMV_L_KERNEL = symv_L_vector.c DSYMV_U_KERNEL = symv_U_vector.c DSYMV_L_KERNEL = symv_L_vector.c + + CSYMV_U_KERNEL = ../generic/zsymv_k.c CSYMV_L_KERNEL = ../generic/zsymv_k.c ZSYMV_U_KERNEL = ../generic/zsymv_k.c ZSYMV_L_KERNEL = ../generic/zsymv_k.c + + CHEMV_L_KERNEL = zhemv_LM_vector.c CHEMV_M_KERNEL = zhemv_LM_vector.c CHEMV_U_KERNEL = zhemv_UV_vector.c @@ -207,3 +211,7 @@ COMATCOPY_CN = zomatcopy_cn_vector.c DOMATCOPY_CN = omatcopy_cn_vector.c SOMATCOPY_CN = omatcopy_cn_vector.c + +SAXPBYKERNEL = axpby_vector_v2.c +DAXPBYKERNEL = axpby_vector_v2.c + diff --git a/kernel/riscv64/axpby_vector_v2.c b/kernel/riscv64/axpby_vector_v2.c new file mode 100644 index 000000000..369346e1b --- /dev/null +++ b/kernel/riscv64/axpby_vector_v2.c @@ -0,0 +1,149 @@ +/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if !defined(DOUBLE) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m8) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m8) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m8) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m8) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8) +#else +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#endif + +int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) +{ + FLOAT_V_T vx, vy; + unsigned int gvl; + if (n <= 0) + return (0); + if (inc_x == 1 && inc_y == 1) + { + while (n > 0) + { + gvl = VSETVL(n); + + vx = VLEV_FLOAT(x, gvl); + vy = VLEV_FLOAT(y, gvl); + + vy = VFMULVF_FLOAT(vy, beta, gvl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl); + + VSEV_FLOAT(y, vy, gvl); + + x += gvl; + y += gvl; + n -= gvl; + } + } + else if (1 == inc_x) + { + BLASLONG stride_y = inc_y * sizeof(FLOAT); + while (n > 0) + { + gvl = VSETVL(n); + vy = VLSEV_FLOAT(y, stride_y, gvl); + vx = VLEV_FLOAT(x, gvl); + + vy = VFMULVF_FLOAT(vy, beta, gvl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl); + + VSSEV_FLOAT(y, stride_y, vy, gvl); + + x += gvl; + y += gvl * inc_y; + n -= gvl; + } + } + else if (1 == inc_y) + { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + + while (n > 0) + { + gvl = VSETVL(n); + + vx = VLSEV_FLOAT(x, stride_x, gvl); + vy = VLEV_FLOAT(y, gvl); + + vy = VFMULVF_FLOAT(vy, beta, gvl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl); + + VSEV_FLOAT(y, vy, gvl); + + x += gvl * inc_x; + y += gvl; + n -= gvl; + } + } + else if (inc_y == 0) + { + FLOAT vf = y[0]; + for (; n > 0; n--) + { + vf = (vf * beta) + (x[0] * alpha); + x += inc_x; + } + y[0] = vf; + } + else + { + BLASLONG stride_x = inc_x * sizeof(FLOAT); + BLASLONG stride_y = inc_y * sizeof(FLOAT); + while (n > 0) + { + gvl = VSETVL(n); + vy = VLSEV_FLOAT(y, stride_y, gvl); + vx = VLSEV_FLOAT(x, stride_x, gvl); + + vy = VFMULVF_FLOAT(vy, beta, gvl); + vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl); + + VSSEV_FLOAT(y, stride_y, vy, gvl); + + x += gvl * inc_x; + y += gvl * inc_y; + n -= gvl; + } + } + + return (0); +} From d2003dc8869366c1054e41f181a4d41a152035db Mon Sep 17 00:00:00 2001 From: guoyuanplct Date: Thu, 29 May 2025 18:38:22 +0800 Subject: [PATCH 178/205] del lines --- kernel/riscv64/KERNEL.RISCV64_ZVL256B | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index 0fd6adb8b..2b4f0a545 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -170,14 +170,11 @@ SSYMV_L_KERNEL = symv_L_vector.c DSYMV_U_KERNEL = symv_U_vector.c DSYMV_L_KERNEL = symv_L_vector.c - CSYMV_U_KERNEL = ../generic/zsymv_k.c CSYMV_L_KERNEL = ../generic/zsymv_k.c ZSYMV_U_KERNEL = ../generic/zsymv_k.c ZSYMV_L_KERNEL = ../generic/zsymv_k.c - - CHEMV_L_KERNEL = zhemv_LM_vector.c CHEMV_M_KERNEL = zhemv_LM_vector.c CHEMV_U_KERNEL = zhemv_UV_vector.c @@ -214,4 +211,3 @@ SOMATCOPY_CN = 
omatcopy_cn_vector.c SAXPBYKERNEL = axpby_vector_v2.c DAXPBYKERNEL = axpby_vector_v2.c - From 2ae019161a85333a35018b517d4b34474a7694e9 Mon Sep 17 00:00:00 2001 From: guoyuanplct Date: Thu, 5 Jun 2025 21:53:03 +0800 Subject: [PATCH 179/205] fixed the performance problem in RISCV64_ZVL256 when OPENBLAS_K is small --- kernel/riscv64/zaxpy_vector.c | 47 ++++++++++++++++++++++++++++++ kernel/riscv64/zdot_vector.c | 54 ++++++++++++++++++++++++++++++++++- 2 files changed, 100 insertions(+), 1 deletion(-) diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c index 1e766c5f4..dd5906931 100644 --- a/kernel/riscv64/zaxpy_vector.c +++ b/kernel/riscv64/zaxpy_vector.c @@ -43,8 +43,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) #endif +#if !defined(DOUBLE) +inline int small_caxpy_kernel(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +#else +inline int small_zaxpy_kernel(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +#endif +{ + BLASLONG i=0; + BLASLONG ix,iy; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + if ( da_r == 0.0 && da_i == 0.0 ) return(0); + + ix = 0; + iy = 0; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { +#if !defined(DOUBLE) + if(n < 16) { + return small_caxpy_kernel(n, dummy0, dummy1, da_r, da_i, x, inc_x, y, inc_y, dummy, dummy2); + } +#else + if(n < 8) { + return small_zaxpy_kernel(n, dummy0, dummy1, da_r, da_i, x, inc_x, y, inc_y, dummy, dummy2); + } +#endif BLASLONG i = 0, j = 0; BLASLONG ix = 0,iy = 0; if(n <= 0) return(0); diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c index 13b8fe378..398de28e5 100644 --- a/kernel/riscv64/zdot_vector.c +++ b/kernel/riscv64/zdot_vector.c @@ -68,8 +68,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4) #endif +#if !defined(DOUBLE) + inline OPENBLAS_COMPLEX_FLOAT small_cdot_kernel(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +inline OPENBLAS_COMPLEX_FLOAT small_zdot_kernel(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot[2]; + OPENBLAS_COMPLEX_FLOAT result; + BLASLONG inc_x2; + BLASLONG inc_y2; + + dot[0]=0.0; + dot[1]=0.0; + + CREAL(result) = 0.0 ; + CIMAG(result) = 0.0 ; + + if ( n < 1 ) return(result); + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { +#if !defined(CONJ) + dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; + dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; +#else + dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; + dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; + return(result); + +} OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { +#if !defined(DOUBLE) +if(n < 16) { + return small_cdot_kernel(n, x, inc_x, y, inc_y); +} +#else +if(n < 8) { + return small_zdot_kernel(n, x, inc_x, y, inc_y); +} +#endif BLASLONG i=0, j=0; BLASLONG ix=0,iy=0; FLOAT dot[2]; @@ -148,4 +200,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA CREAL(result) = dot[0]; CIMAG(result) = dot[1]; return(result); -} +} \ No newline at end of file From 5442aff218e47fdf882dd2828b3552618b4bc761 Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Sun, 8 Jun 2025 19:50:15 +0000 Subject: [PATCH 180/205] Accumulate results in output register explicitly --- kernel/arm64/dot_kernel_asimd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm64/dot_kernel_asimd.c b/kernel/arm64/dot_kernel_asimd.c index a404c9636..f52112830 100644 --- a/kernel/arm64/dot_kernel_asimd.c +++ b/kernel/arm64/dot_kernel_asimd.c @@ -134,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " fadd v4.4s, v4.4s, v6.4s \n" \ " fadd v0.4s, v0.4s, v4.4s \n" \ " faddp v0.4s, v0.4s, v0.4s \n" \ - " faddp v0.4s, v0.4s, v0.4s \n" + " faddp "OUT", v0.2s \n" #else /* !defined(DSDOT) */ #define KERNEL_F1 \ From f18b7a46bf4597a2b8eb07365cf41a888dfc924a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 11 Jun 2025 01:47:43 -0700 Subject: [PATCH 181/205] add dummy2 flag handling for inf/nan agnostic zeroing --- kernel/riscv64/zscal_rvv.c | 14 +++++++++++++- kernel/riscv64/zscal_vector.c | 13 ++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c index ae79d9f9d..9f990e0c0 100644 --- a/kernel/riscv64/zscal_rvv.c +++ b/kernel/riscv64/zscal_rvv.c @@ -70,6 +70,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F FLOAT_VX2_T vx2; if(inc_x == 1) { + if (dummy2 == 0 && da_r==0. && da_i == 0.) 
{ + BLASLONG i; + for (i=0; i < n*2; i++) x[i]=0.; + return(0); + } else { for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); @@ -80,6 +85,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F vt = VFMULVF_FLOAT(vr, da_r, vl); vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); @@ -87,9 +93,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F vx2 = VSET_VX2(vx2, 1, vi); VSSEG_FLOAT(x, vx2, vl); } + } } else { - + if (dummy2 == 0 && da_r==0. && da_i == 0.) { + BLASLONG i,ix=0,inc_x2=2*inc_x; + for (i=0; i < n; i++) {x[ix]=0.;x[ix+1]=0.;ix+=inc_x2;}; + return(0); + } else { for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); @@ -105,6 +116,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F vx2 = VSET_VX2(vx2, 0, vt); vx2 = VSET_VX2(vx2, 1, vi); VSSSEG_FLOAT(x, stride_x, vx2, vl); + } } } diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 536bbdf73..a72361b04 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -57,9 +57,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F if((n <= 0) || (inc_x <= 0)) return(0); - unsigned int gvl = 0; - FLOAT_V_T vt, v0, v1; - { + if (dummy2 == 0 && da_r == 0. && da_i == 0.) { + int i,inc_x2,ix; + inc_x2 = 2*inc_x; + ix=0; + for (i=0;i Date: Wed, 11 Jun 2025 22:10:46 +0200 Subject: [PATCH 182/205] Use generic SCAL kernels to address inf/nan handling for now --- kernel/sparc/KERNEL | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/sparc/KERNEL b/kernel/sparc/KERNEL index d6580609b..b2a8184a8 100644 --- a/kernel/sparc/KERNEL +++ b/kernel/sparc/KERNEL @@ -86,3 +86,8 @@ endif ifndef QROTMKERNEL QROTMKERNEL = ../generic/rotm.c endif + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c From e12132abd4b43d4e2560bd492204b6cba26f8563 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 11 Jun 2025 22:12:10 +0200 Subject: [PATCH 183/205] Use generic C/ZSCAL kernels to address inf/nan handling for now --- kernel/x86/KERNEL | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/x86/KERNEL b/kernel/x86/KERNEL index 1095c1528..3ae268e6c 100644 --- a/kernel/x86/KERNEL +++ b/kernel/x86/KERNEL @@ -200,3 +200,6 @@ endif ifndef QROTMKERNEL QROTMKERNEL = ../generic/rotm.c endif + +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c From 58eeb9041cfe93c56fb09337040b3994bddd8fc0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 12 Jun 2025 03:03:01 -0700 Subject: [PATCH 184/205] fix handling of dummy2 --- kernel/arm64/zscal.S | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S index 93e51b70c..97d8a8b7a 100644 --- a/kernel/arm64/zscal.S +++ b/kernel/arm64/zscal.S @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define INC_X x4 /* X stride */ #define I x5 /* loop variable */ #define X_COPY x6 /* Copy of X */ -#define FLAG x7 /* NaN handling level */ +#define FLAG x7 /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -216,23 +216,22 @@ zscal_begin: cmp N, xzr ble .Lzscal_kernel_L999 - - ldr FLAG, [sp] - cmp FLAG, #1 - beq .Lzscal_kernel_R_non_zero +ldr FLAG, [sp] +cmp FLAG, #1 +beq .Lzscal_kernel_RI_non_zero fcmp DA_R, #0.0 bne .Lzscal_kernel_R_non_zero -// fcmp DA_I, #0.0 -// beq .Lzscal_kernel_RI_zero + fcmp DA_I, #0.0 + beq .Lzscal_kernel_RI_zero // b .Lzscal_kernel_R_zero .Lzscal_kernel_R_non_zero: fcmp DA_I, #0.0 - beq .Lzscal_kernel_I_zero +//QUAK beq .Lzscal_kernel_I_zero /******************************************************************************* * A_R != 0 && A_I != 0 From 549a9f1dbb152945b14f9376d055ad7c12042917 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 12 Jun 2025 18:54:33 +0200 Subject: [PATCH 185/205] Disable the default SSE kernels for CSCAL/ZSCAL for now --- kernel/x86_64/KERNEL | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index c270ff077..a8ba70a31 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -323,11 +323,11 @@ DSCALKERNEL = scal_sse2.S endif ifndef CSCALKERNEL -CSCALKERNEL = zscal_sse.S +CSCALKERNEL = ../arm/zscal.c endif ifndef ZSCALKERNEL -ZSCALKERNEL = zscal_sse2.S +ZSCALKERNEL = ../arm/zscal.c endif ifndef ASCALKERNEL From 73af02b89fa807402ff37d459c593e121542ee6f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 12 Jun 2025 13:33:56 -0700 Subject: [PATCH 186/205] use dummy2 as Inf/NAN handling flag --- kernel/riscv64/zscal.c | 89 +++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 49 deletions(-) diff --git a/kernel/riscv64/zscal.c b/kernel/riscv64/zscal.c index 8499145f4..b210f9af3 100644 --- a/kernel/riscv64/zscal.c +++ b/kernel/riscv64/zscal.c @@ -27,65 +27,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************************** * 2013/09/14 Saar -* BLASTEST float : OK -* BLASTEST double : OK -* CTEST : OK -* TEST : OK +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK * **************************************************************************************/ #include "common.h" +// The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces. +// In certain cases, the expected return values for cblas_s/zscal differ from those of other upper-level interfaces. +// To handle this, we use the dummy2 parameter to differentiate between them. 
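To make the effect of that flag concrete, here is a small illustration of the two behaviours selected when alpha is exactly zero, distilled from the branches added in these kernels; the helper name scale_by_zero and the use of double are assumptions of the example. With the flag at 0 the element is cleared unconditionally, while with a nonzero flag NaN and Inf inputs propagate as NaN, matching IEEE 754 where 0 * Inf and 0 * NaN are NaN:

#include <math.h>
#include <stdio.h>

/* Illustration of the two zero-scaling policies for one complex element (re, im).
   ieee_mode plays the role of the dummy2 flag passed to the kernels. */
static void scale_by_zero(double *re, double *im, int ieee_mode)
{
    if (ieee_mode && (isnan(*re) || isnan(*im) || isinf(*re) || isinf(*im))) {
        *re = NAN;   /* 0 * Inf and 0 * NaN are NaN, so the element must not be cleared */
        *im = NAN;
    } else {
        *re = 0.0;   /* legacy ?SCAL behaviour: alpha == 0 simply zeroes the vector */
        *im = 0.0;
    }
}

int main(void)
{
    double re = NAN, im = 1.0;
    scale_by_zero(&re, &im, 0);  /* -> 0 0 */
    printf("%g %g\n", re, im);
    re = NAN; im = 1.0;
    scale_by_zero(&re, &im, 1);  /* -> nan nan */
    printf("%g %g\n", re, im);
    return 0;
}

Which upper-level interface passes which flag value is arranged by the callers and is not shown in this hunk.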
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i=0; - BLASLONG inc_x2; - BLASLONG ip = 0; - FLOAT temp; + BLASLONG i = 0; + BLASLONG inc_x2; + BLASLONG ip = 0; + FLOAT temp; - if ( (n <= 0) || (inc_x <= 0)) - return(0); + if ((n <= 0) || (inc_x <= 0)) + return(0); + inc_x2 = 2 * inc_x; + if (dummy2 == 0) { + for (i = 0; i < n; i++) + { + if (da_r == 0.0 && da_i == 0.0) + { + x[ip] = 0.0; + x[ip+1] = 0.0; + } + else + { + temp = da_r * x[ip] - da_i * x[ip+1]; + x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; + x[ip] = temp; + } - inc_x2 = 2 * inc_x; - for ( i=0; i Date: Fri, 13 Jun 2025 00:54:27 -0700 Subject: [PATCH 187/205] resync with the generic arm version for inf/nan handling --- kernel/mips/zscal.c | 97 ++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 50 deletions(-) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index ae1c87fce..b210f9af3 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project +Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,61 +25,58 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + #include "common.h" +// The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces. +// In certain cases, the expected return values for cblas_s/zscal differ from those of other upper-level interfaces. +// To handle this, we use the dummy2 parameter to differentiate between them. 
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i=0; - BLASLONG inc_x2; - BLASLONG ip = 0; - FLOAT temp; + BLASLONG i = 0; + BLASLONG inc_x2; + BLASLONG ip = 0; + FLOAT temp; - inc_x2 = 2 * inc_x; - for ( i=0; i Date: Fri, 13 Jun 2025 13:32:02 +0200 Subject: [PATCH 188/205] temporarily change default C/ZSCAL to the non-asm implementation --- kernel/mips64/KERNEL | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 2ebd8a5bd..0ebb459b3 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -6,6 +6,9 @@ CROTKERNEL = ../mips/zrot.c ZROTKERNEL = ../mips/zrot.c CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c + +CSCALKERNEL = zscal.c +ZSCALKERNEL = zscal.c ifndef SNRM2KERNEL From e338d34ce1d3c3cfed50e0060fee392ca7ef3166 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 13 Jun 2025 13:37:15 +0200 Subject: [PATCH 189/205] fix path --- kernel/mips64/KERNEL | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 0ebb459b3..d720aaff5 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -7,8 +7,8 @@ ZROTKERNEL = ../mips/zrot.c CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c ifndef SNRM2KERNEL From 5e393f207cb8615617e235f539078ff2da0362fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Jun 2025 00:06:34 +0200 Subject: [PATCH 190/205] fix source file used for sbgemmt/sbgemmtr --- interface/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index a3ee6559e..eb2bce3f0 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -125,8 +125,8 @@ endif () if (BUILD_BFLOAT16) GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") - GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") - GenerateNamedObjects("gemmt.c" "RNAME" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("sbgemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("sbgemmt.c" "RNAME" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") From 874744976ca4366841b25122fc5dfb1cbd981b30 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Jun 2025 19:11:26 +0200 Subject: [PATCH 191/205] fix dimension used in nancheck (Reference-LAPACK PR 1135) --- lapack-netlib/LAPACKE/src/lapacke_cunmlq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunmlq.c b/lapack-netlib/LAPACKE/src/lapacke_cunmlq.c index 224fa7866..b5160aeb7 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cunmlq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cunmlq.c @@ -48,8 +48,10 @@ lapack_int LAPACKE_cunmlq( int matrix_layout, char side, char trans, } #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { + lapack_int r; /* Optionally check input matrices for NaNs */ - if( 
LAPACKE_cge_nancheck( matrix_layout, k, m, a, lda ) ) { + r = LAPACKE_lsame( side, 'l' ) ? m : n; + if( LAPACKE_cge_nancheck( matrix_layout, k, r, a, lda ) ) { return -7; } if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { From d8a2324699d6eb9f9eda7f611c51c83d6f489a72 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Jun 2025 19:13:23 +0200 Subject: [PATCH 192/205] fix dimension used in nancheck (Reference-LAPACK PR 1135) --- lapack-netlib/LAPACKE/src/lapacke_cunmlq_work.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunmlq_work.c b/lapack-netlib/LAPACKE/src/lapacke_cunmlq_work.c index 204dc72a7..7f6a24897 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cunmlq_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cunmlq_work.c @@ -90,7 +90,7 @@ lapack_int LAPACKE_cunmlq_work( int matrix_layout, char side, char trans, goto exit_level_1; } /* Transpose input matrices */ - LAPACKE_cge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); + LAPACKE_cge_trans( matrix_layout, k, r, a, lda, a_t, lda_t ); LAPACKE_cge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ LAPACK_cunmlq( &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t, &ldc_t, From 2a6beac88f7e7e5ddd63f06d53138ba21bf15023 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Jun 2025 19:14:53 +0200 Subject: [PATCH 193/205] fix dimension used in transposition (Reference-LAPACK PR 1135) --- lapack-netlib/LAPACKE/src/lapacke_zunmlq_work.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunmlq_work.c b/lapack-netlib/LAPACKE/src/lapacke_zunmlq_work.c index e82e7e3c6..1217514dd 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zunmlq_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zunmlq_work.c @@ -90,7 +90,7 @@ lapack_int LAPACKE_zunmlq_work( int matrix_layout, char side, char trans, goto exit_level_1; } /* Transpose input matrices */ - LAPACKE_zge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); + LAPACKE_zge_trans( matrix_layout, k, r, a, lda, a_t, lda_t ); LAPACKE_zge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ LAPACK_zunmlq( &side, &trans, &m, &n, &k, a_t, &lda_t, tau, c_t, &ldc_t, From f4e517705051d7bb73de5bb3c2d6945682d77d3f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Jun 2025 19:16:58 +0200 Subject: [PATCH 194/205] fix dimension used in nancheck (Reference-LAPACK PR 1135) --- lapack-netlib/LAPACKE/src/lapacke_zunmlq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunmlq.c b/lapack-netlib/LAPACKE/src/lapacke_zunmlq.c index 06b10389c..602104620 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zunmlq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zunmlq.c @@ -48,8 +48,10 @@ lapack_int LAPACKE_zunmlq( int matrix_layout, char side, char trans, } #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { + lapack_int r; /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, k, m, a, lda ) ) { + r = LAPACKE_lsame( side, 'l' ) ? 
m : n; + if( LAPACKE_zge_nancheck( matrix_layout, k, r, a, lda ) ) { return -7; } if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { From 906b9df31648eb5407bdfd79af407d154ef1b051 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Jun 2025 19:34:01 +0200 Subject: [PATCH 195/205] fix missing initialization --- lapack-netlib/SRC/cgeqp3rk.f | 1 + 1 file changed, 1 insertion(+) diff --git a/lapack-netlib/SRC/cgeqp3rk.f b/lapack-netlib/SRC/cgeqp3rk.f index 731c44edb..fecf8d85c 100644 --- a/lapack-netlib/SRC/cgeqp3rk.f +++ b/lapack-netlib/SRC/cgeqp3rk.f @@ -761,6 +761,7 @@ * for the whole original matrix stored in A(1:M,1:N). * KP1 = ISAMAX( N, RWORK( 1 ), 1 ) + MAXC2NRM = RWORK( KP1 ) * * ==================================================================. * From 1804ff58d7967c528f6006fe2f10ea3f3c2d3d06 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Jun 2025 19:35:34 +0200 Subject: [PATCH 196/205] fix missing initialization --- lapack-netlib/SRC/zgeqp3rk.f | 1 + 1 file changed, 1 insertion(+) diff --git a/lapack-netlib/SRC/zgeqp3rk.f b/lapack-netlib/SRC/zgeqp3rk.f index 01dcce0de..f637966c8 100644 --- a/lapack-netlib/SRC/zgeqp3rk.f +++ b/lapack-netlib/SRC/zgeqp3rk.f @@ -760,6 +760,7 @@ * for the whole original matrix stored in A(1:M,1:N). * KP1 = IDAMAX( N, RWORK( 1 ), 1 ) + MAXC2NRM = RWORK( KP1 ) * * ==================================================================. * From bad47bd0249c8bbec5d862745709148a1a858268 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Jun 2025 13:47:14 -0700 Subject: [PATCH 197/205] Fix too strict leading dimensions check in LAPACKE_?gesdd_work (Reference-LAPACK PR #1126) (#5307) * relax leading dimensions check (Reference-LAPACK PR #1126) --- lapack-netlib/LAPACKE/src/lapacke_cgesdd_work.c | 5 ++++- lapack-netlib/LAPACKE/src/lapacke_dgesdd_work.c | 5 ++++- lapack-netlib/LAPACKE/src/lapacke_sgesdd_work.c | 5 ++++- lapack-netlib/LAPACKE/src/lapacke_zgesdd_work.c | 5 ++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesdd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgesdd_work.c index 70198ccdc..472788af6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesdd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesdd_work.c @@ -58,6 +58,9 @@ lapack_int LAPACKE_cgesdd_work( int matrix_layout, char jobz, lapack_int m, lapack_int nrows_vt = ( LAPACKE_lsame( jobz, 'a' ) || ( LAPACKE_lsame( jobz, 'o' ) && m>=n) ) ? n : ( LAPACKE_lsame( jobz, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobz, 'a' ) || + LAPACKE_lsame( jobz, 's' ) || + ( LAPACKE_lsame( jobz, 'o' && m >=n) ) ? n : 1); lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -75,7 +78,7 @@ lapack_int LAPACKE_cgesdd_work( int matrix_layout, char jobz, lapack_int m, LAPACKE_xerbla( "LAPACKE_cgesdd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -11; LAPACKE_xerbla( "LAPACKE_cgesdd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesdd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgesdd_work.c index 7bef2230c..3f805b554 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgesdd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgesdd_work.c @@ -56,6 +56,9 @@ lapack_int LAPACKE_dgesdd_work( int matrix_layout, char jobz, lapack_int m, lapack_int nrows_vt = ( LAPACKE_lsame( jobz, 'a' ) || ( LAPACKE_lsame( jobz, 'o' ) && m>=n) ) ? n : ( LAPACKE_lsame( jobz, 's' ) ? 
MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobz, 'a' ) || + LAPACKE_lsame( jobz, 's' ) || + ( LAPACKE_lsame( jobz, 'o' && m >=n) ) ? n : 1); lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -73,7 +76,7 @@ lapack_int LAPACKE_dgesdd_work( int matrix_layout, char jobz, lapack_int m, LAPACKE_xerbla( "LAPACKE_dgesdd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -11; LAPACKE_xerbla( "LAPACKE_dgesdd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesdd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgesdd_work.c index b6619e38b..4cfb91f33 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgesdd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesdd_work.c @@ -56,6 +56,9 @@ lapack_int LAPACKE_sgesdd_work( int matrix_layout, char jobz, lapack_int m, lapack_int nrows_vt = ( LAPACKE_lsame( jobz, 'a' ) || ( LAPACKE_lsame( jobz, 'o' ) && m>=n) ) ? n : ( LAPACKE_lsame( jobz, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobz, 'a' ) || + LAPACKE_lsame( jobz, 's' ) || + ( LAPACKE_lsame( jobz, 'o' && m >=n) ) ? n : 1); lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -73,7 +76,7 @@ lapack_int LAPACKE_sgesdd_work( int matrix_layout, char jobz, lapack_int m, LAPACKE_xerbla( "LAPACKE_sgesdd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -11; LAPACKE_xerbla( "LAPACKE_sgesdd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesdd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgesdd_work.c index fc07fe9cb..5d513fb20 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesdd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesdd_work.c @@ -58,6 +58,9 @@ lapack_int LAPACKE_zgesdd_work( int matrix_layout, char jobz, lapack_int m, lapack_int nrows_vt = ( LAPACKE_lsame( jobz, 'a' ) || ( LAPACKE_lsame( jobz, 'o' ) && m>=n) ) ? n : ( LAPACKE_lsame( jobz, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobz, 'a' ) || + LAPACKE_lsame( jobz, 's' ) || + ( LAPACKE_lsame( jobz, 'o' && m >=n) ) ? 
n : 1); lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -75,7 +78,7 @@ lapack_int LAPACKE_zgesdd_work( int matrix_layout, char jobz, lapack_int m, LAPACKE_xerbla( "LAPACKE_zgesdd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -11; LAPACKE_xerbla( "LAPACKE_zgesdd_work", info ); return info; From 3fe7f196e659015e3381d9ed73de76dccfbe0e2a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 15 Jun 2025 23:31:45 +0200 Subject: [PATCH 198/205] Update the Changelog for version 0.3.30 --- Changelog.txt | 130 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index b52734c82..3e988fdaa 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,134 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.30 +16-Jun-2025 + +general: + - fixed an installation problem with the thread safety test in gmake builds + - fixed spurious overwriting of an input array in complex GEMMT/GEMMTR + - fixed naming of GEMMTR in error messages from XERBLA + - fixed compilation of SBGEMMT/SBGEMMTR in CMake builds + - fixed the implementation of ?NRM2 to handle INCX=0 correctly + - removed tests for CSROT and ZDROT that relied on unspecified behavior + - fixed a performance regression in multithreaded GEMM that was particularly + serious on POWER targets + - fixed linking issues when using LLVM's flang-new with gmake + - fixed a potential thread safety problem with C11 atomic operations + - further improved the workload partitioning in parallel GEMM + - fixed omission of LAPACKE interfaces for CGESVDQ,CTRSYL3 and ?GEQPF in + CMake builds + - fixed mishandling of setting NO_LAPACK to FALSE, and incorrect dependencies + for LAPACK function SPMV in CMake builds + - added explicit CMake options for building LAPACKE and shared libraries + - simplified and improved handling of OpenMP options in CMake builds + - reworked Windows DLL generation in CMake builds to ensure correct symbol + renaming (pre/postfixing) and optional generation of PDB files for debugging + - updated the Perl script version of the gensymbol utility for use with + Windows-on-Arm + - Fixed building with (Mingw) gmake on Windows to ensure completeness of the + LAPACK included in the static library (potential race condition due to the + Windows version of the "ln" utility creating snapshot copies rather than links) + - fixed unwanted deletion of the lapacke_mangling.h file by "make clean" + - fixed potential duplication of a _64 suffix on library names in CMake builds + - fixed compilation of the C fallback copies of the LAPACK code with GCC 15 + - included fixed from the Reference-LAPACK project: + - fixed a truncated error message in the EIG part of the testsuite + (Reference-LAPACK PR 1119) + - fixed too strict check in LAPACKE_?gesdd_work (PR #1126) + - fixed memory corruption when calling ?GEEV with non-finite data (PR #1128) + - fixed missing initialization of a variable in C/GEQP3RK (PR #1131) + - fixed 2nd dimension chosen in C/ZUNMLQ transposition operation (PR #1135) + +x86_64: + - fixed an error in the SBGEMV kernel for Cooper Lake/Sapphire Rapids + - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL + - improved the compiler identification code for flang-new + - fixed a potential build issue in the ZSUM kernel + - fixed "argument list too long" errors when building on MacOS + - added cpu autodetection support for 
several new Arrow Lake models + - fixed conditional inclusion of the fast path SGEMM kernel in DYNAMIC_ARCH + - fixed compilation with the MinGW build of GCC 15 + +arm64: + - added an optimized SBGEMM kernel for NEOVERSEV1 + - improved 1xN SBGEMM performance by forwarding to SBGEMV + - introduced a stepwise increase of the thread count used for + SGEMM and SGEMV on NEOVERSEV1/V2 in relation to problem size + - introduced a stepwise increase of the thread count used for + DGEMV on NEOVERSEV1 in relation to problem size + - introduced a stepwise increase of the thread count used for + SDOT and DDOT on NEOVERSEV1 in relation to problem size + - worked around assembler limitations in LLVM for Windows-on-Arm + - enabled cpu type autodetection from the registry on Windows-on-Arm + - improved multithreading threshold for GEMV and GESV on Windows-on-Arm + - fixed overoptimization issues with LLVM's flang in Windows-on-Arm + - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL + - added a fast path SGEMM kernel for small workloads on SME capable targets + - improved performance of SGEMM and DGEMM kernels for small workloads + - improved performance of SGEMV and DGEMV on SVE-capable targets + - improved performance of SGEMV on NEOVERSEN1 and Apple M + - added optimized SSYMV and DSYMV kernels for NEOVERSEN1, Apple M and all + SVE capable targets + - added optimized SBGEMV kernels for NEOVERSEV1/V2/N2 + - improved performance of SGEMM through faster NCOPY kernels + - added compiler options for the NVIDIA HPC Compiler Suite + - fixed compilation on OSX with XCode 16.3 and later + - fixed cpu core type and cache size detection on Apple M4 + - updated GEMM parameter settings for Neoverse cpus in cross-builds with CMake + - fixed default compiler options for NEOVERSEN1 and CORTEXX2 in CMake builds + - fixed conditional inclusion of the fast path SGEMM kernel in DYNAMIC_ARCH + - fixed potential miscompilation of the non-SVE SDOT kernel + +riscv64: + - added optimized SROTM and DROTM kernels for x280 + - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL + - improved performance of GEMM_TCOPY on RVV1.0 targets with + VLEN of 128 or 256 + - improved performance of OMATCOPY on targets with VLEN 256 + - greatly improved performance of SGEMV/DGEMV + - improved performance of CGEMV and ZGEMV on C910V and all RVV targets + with VLEN 256 + - improved performance of SAXPBY and DAXPBY on C910V and all RVV targets + with VLEN 256 + - improved performance of AXPY and DOT on C910V and ZVL256B targets by + falling back to non-vectorized code for very small N. 
(Thereby fixing + poor performance of CHBMV/ZHBMV for very small K) + - fixed CMake build failures of the TRMM kernels + +loongarch64: + - improved performance of the LSX versions of SSYMV/DSYMV + - made the LASX versions of the DSYMV and SSYMV kernels + compatible with hardware changes in LA664 and future targets + - fixed inaccuracies in several LASX kernels + - improved compatibility of LSX kernels with LA264 targets + - fixed handling of deprecated target names in CMake builds + - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL + +power: + - fixed building for PPCG4 with CMake + - fixed SSCAL/DSCAL on PPC970 running FreeBSD + - fixed a potential alignment issue in the POWER8 SGEMV kernel + - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL + +zarch: + - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL + - fixed unwanted generation of object files with a writable stack + +x86: + - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL + +arm: + - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL + +sparc: + - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL + +alpha: + - fixed build failure caused by spurious Windows-only typecasts + +cell: + - fixed probable build issue caused by spurious Windows-only typecasts + ==================================================================== Version 0.3.29 12-Jan-2025 From 1dd396033a4a30a3a623bfa58a04d0560e4fb9d8 Mon Sep 17 00:00:00 2001 From: Masato Nakagawa Date: Mon, 16 Jun 2025 19:50:08 +0900 Subject: [PATCH 199/205] Fix:Problem with identifying some ARM64 processors. --- cpuid_arm64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index c60725828..2bf93cc87 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -276,11 +276,11 @@ int detect(void) fclose(infile); } } - sprintf(cpuimpl,"0x%2x",implementer); + sprintf(cpuimpl,"0x%02x",implementer); cpu_implementer=strdup(cpuimpl); } qsort(cpucores,1024,sizeof(int),cpusort); - sprintf(cpupart,"0x%3x",cpucores[0]); + sprintf(cpupart,"0x%03x",cpucores[0]); cpu_part=strdup(cpupart); if(cpu_part != NULL && cpu_implementer != NULL) { // Arm From 53cd6e7ff783ef74b5de63ca86fd55a3b001d727 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 16 Jun 2025 13:31:15 +0200 Subject: [PATCH 200/205] Update Changelog.txt --- Changelog.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Changelog.txt b/Changelog.txt index 3e988fdaa..ea1a31a11 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -50,6 +50,7 @@ x86_64: - fixed compilation with the MinGW build of GCC 15 arm64: + - fixed cpu type detection of A64FX and some ThunderX models (broken in 0.3.29) - added an optimized SBGEMM kernel for NEOVERSEV1 - improved 1xN SBGEMM performance by forwarding to SBGEMV - introduced a stepwise increase of the thread count used for From 3318a2b904a416cc00b913e870cd74c700a7f129 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 17 Jun 2025 22:41:40 +0200 Subject: [PATCH 201/205] override CDOT and ZDOT with the generic C kernel --- kernel/x86/KERNEL | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86/KERNEL b/kernel/x86/KERNEL index 3ae268e6c..0be5ef5c5 100644 --- a/kernel/x86/KERNEL +++ b/kernel/x86/KERNEL @@ -203,3 +203,5 @@ endif CSCALKERNEL = ../arm/zscal.c ZSCALKERNEL = ../arm/zscal.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c From e684e363778422bea3ee57a0ee44679e02097b6e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 17 Jun 
2025 22:50:53 +0200 Subject: [PATCH 202/205] Add 32bit manylinux to match what python wheel build tests use --- azure-pipelines.yml | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 7941bf463..21e0b9693 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -25,14 +25,28 @@ jobs: echo "FROM quay.io/pypa/manylinux1_x86_64 COPY . /tmp/openblas RUN cd /tmp/openblas && \ - COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \ - BTYPE='BINARY=64' CC=gcc && \ - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \ - make -C test $COMMON_FLAGS $BTYPE && \ - make -C ctest $COMMON_FLAGS $BTYPE && \ - make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile + CC=gcc && \ + make QUIET_MAKE=1 BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 && \ + make -C test BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 && \ + make -C ctest BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 && \ + make -C utest BINARY=64 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" > Dockerfile docker build . displayName: Run manylinux1 docker build +- job: manylinux_32bit + pool: + vmImage: 'ubuntu-latest' + steps: + - script: | + echo "FROM quay.io/pypa/manylinux2014_i686 + COPY . /tmp/openblas + RUN cd /tmp/openblas && \ + CC=gcc && \ + make QUIET_MAKE=1 BINARY=32 TARGET=NEHALEM NUM_THREADS=32 && \ + make -C test BINARY=32 TARGET=NEHALEM NUM_THREADS=32 && \ + make -C ctest BINARY=32 TARGET=NEHALEM NUM_THREADS=32 && \ + make -C utest BINARY=32 TARGET=NEHALEM NUM_THREADS=32" > Dockerfile + docker build . + displayName: Run manylinux 32bit docker build - job: Intel_SDE_skx pool: vmImage: 'ubuntu-latest' From e541bf68f5c1736256e298c5d48192864465b413 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 18 Jun 2025 09:54:08 +0200 Subject: [PATCH 203/205] support AmpereOne/OneA as NeoverseN1 --- driver/others/dynamic_arm64.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 428c5758b..70b51f6fc 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -440,13 +440,21 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_TSV110; } break; - case 0x50: // Ampere + case 0x50: // Ampere/AppliedMicro switch (part) { case 0x000: // Skylark/EMAG8180 return &gotoblas_EMAG8180; } break; + case 0xc0: // Ampere + switch(part) + { + case 0xac3: + case 0xac4: + return &gotoblas_NEOVERSEN1; + } + break; case 0x51: // Qualcomm switch (part) { From 79b4dd0fb073407121c6fd280fd3e787d6184a8f Mon Sep 17 00:00:00 2001 From: minicx Date: Wed, 18 Jun 2025 15:03:03 +0300 Subject: [PATCH 204/205] fix(arm): add .note.GNU-stack to ARM assembly to prevent writable-stack warnings Add .section .note.GNU-stack in ARM assembly epilogue on Linux/ELF targets to avoid warnings about a writable/executable stack and ensure shared objects do not require an executable stack. 
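For illustration (an editorial sketch, not part of this patch): one way to observe the effect described above at run time. If an input object without the .note.GNU-stack marker is linked in, GNU ld typically requests an executable stack through the PT_GNU_STACK program header, and the [stack] mapping of the running process then carries an 'x' permission bit; with the marker present in every object it stays rw-p. A minimal check, assuming a Linux target:

#include <stdio.h>
#include <string.h>

/*
 * Editorial sketch only -- not part of the patch.  Prints the [stack]
 * mapping of the current process; the permission field shows whether the
 * stack ended up executable (e.g. "rw-p" vs "rwxp").
 */
int main(void) {
    FILE *maps = fopen("/proc/self/maps", "r");
    char line[512];
    if (!maps) return 1;
    while (fgets(line, sizeof(line), maps)) {
        if (strstr(line, "[stack]"))
            fputs(line, stdout);
    }
    fclose(maps);
    return 0;
}

The same information is available statically from the GNU_STACK entry in the program headers printed by readelf -l.
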
Signed-off-by: minicx --- common_arm.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/common_arm.h b/common_arm.h index d6291018b..d0d245143 100644 --- a/common_arm.h +++ b/common_arm.h @@ -114,7 +114,15 @@ static inline int blas_quickdivide(blasint x, blasint y){ OPENBLAS_ARM_TYPE_FUNCTION \ REALNAME: -#define EPILOGUE +#if defined(__ELF__) && defined(__linux__) +# define GNUSTACK .section .note.GNU-stack,"",%progbits +#else +# define GNUSTACK +#endif + +#define EPILOGUE \ + GNUSTACK + #define PROFCODE From 157273fda0c31689d5f73f240923424f2448ad69 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 18 Jun 2025 22:44:36 +0200 Subject: [PATCH 205/205] another round of last minute updates for 0.3.30 --- Changelog.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index ea1a31a11..e4ba72986 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,7 +1,7 @@ OpenBLAS ChangeLog ==================================================================== Version 0.3.30 -16-Jun-2025 +19-Jun-2025 general: - fixed an installation problem with the thread safety test in gmake builds @@ -51,6 +51,7 @@ x86_64: arm64: - fixed cpu type detection of A64FX and some ThunderX models (broken in 0.3.29) + - added support for the AmpereOne/1A cpus in DYNAMIC_ ARCH builds - added an optimized SBGEMM kernel for NEOVERSEV1 - improved 1xN SBGEMM performance by forwarding to SBGEMV - introduced a stepwise increase of the thread count used for @@ -117,9 +118,11 @@ zarch: x86: - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL + - worked around potential miscompilation of CDOT with very old binutils arm: - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL + - fixed unwanted generation of object files with a writable stack sparc: - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
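
For illustration (an editorial sketch, not tied to any one patch above): the zero-padding change in cpuid_arm64.c (PATCH 199) is easy to reproduce in isolation. "%3x" pads small part numbers with spaces, while "%03x" pads with zeros, and only the zero-padded form matches literal strings such as "0x001" or "0x0a1" in the kind of strstr() comparisons detect() performs. The part value below is a hypothetical example, not taken from any specific cpu table:

#include <stdio.h>
#include <string.h>

/*
 * Editorial sketch only -- not part of any patch.  The part value 0x001
 * stands in for any ARM part number below 0x100.
 */
int main(void) {
    char buf[16];
    int part = 0x001;

    sprintf(buf, "0x%3x", part);    /* old format: "0x  1" (space padded)  */
    printf("old: \"%s\"  matches \"0x001\": %d\n", buf, strstr(buf, "0x001") != NULL);

    sprintf(buf, "0x%03x", part);   /* fixed format: "0x001" (zero padded) */
    printf("new: \"%s\"  matches \"0x001\": %d\n", buf, strstr(buf, "0x001") != NULL);
    return 0;
}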