From e114880dc4e6d3413303fe045dee19b6c389e979 Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 17 Jan 2025 16:01:50 +0800 Subject: [PATCH 01/37] kernel/generic: Fixed cscal and zscal --- interface/zscal.c | 4 +-- kernel/arm/zscal.c | 89 +++++++++++++++++++++------------------------- 2 files changed, 42 insertions(+), 51 deletions(-) diff --git a/interface/zscal.c b/interface/zscal.c index 498377343..0e52d113b 100644 --- a/interface/zscal.c +++ b/interface/zscal.c @@ -98,7 +98,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ if (nthreads == 1) { #endif - SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 0); + SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 1); #ifdef SMP } else { @@ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ mode = BLAS_SINGLE | BLAS_COMPLEX; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 1, (int (*)(void))SCAL_K, nthreads); } #endif diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c index c4855f73e..b210f9af3 100644 --- a/kernel/arm/zscal.c +++ b/kernel/arm/zscal.c @@ -27,65 +27,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************************** * 2013/09/14 Saar -* BLASTEST float : OK -* BLASTEST double : OK -* CTEST : OK -* TEST : OK +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK * **************************************************************************************/ #include "common.h" +// The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces. +// In certain cases, the expected return values for cblas_c/zscal differ from those of other upper-level interfaces. +// To handle this, we use the dummy2 parameter to differentiate between them. 
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i=0; - BLASLONG inc_x2; - BLASLONG ip = 0; - FLOAT temp; + BLASLONG i = 0; + BLASLONG inc_x2; + BLASLONG ip = 0; + FLOAT temp; - if ( (n <= 0) || (inc_x <= 0)) - return(0); + if ((n <= 0) || (inc_x <= 0)) + return(0); + inc_x2 = 2 * inc_x; + if (dummy2 == 0) { + for (i = 0; i < n; i++) + { + if (da_r == 0.0 && da_i == 0.0) + { + x[ip] = 0.0; + x[ip+1] = 0.0; + } + else + { + temp = da_r * x[ip] - da_i * x[ip+1]; + x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; + x[ip] = temp; + } - inc_x2 = 2 * inc_x; - for ( i=0; i Date: Mon, 20 Jan 2025 15:58:15 +0800 Subject: [PATCH 02/37] utest: Add utest for {c/z}scal and {c/z}gemv --- utest/test_gemv.c | 474 +++++++++++++++++++++++++++++++++++++++++++++ utest/test_zscal.c | 54 ++++++ 2 files changed, 528 insertions(+) diff --git a/utest/test_gemv.c b/utest/test_gemv.c index dab6d2f11..66fc30995 100644 --- a/utest/test_gemv.c +++ b/utest/test_gemv.c @@ -128,3 +128,477 @@ CTEST(dgemv, 0_nan_inf_incy_2) } #endif + +#ifdef BUILD_COMPLEX + +CTEST(cgemv, 0_nan_inf) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 1; + float alpha[2] = {0.0, 0.0}; + float beta[2] = {0.0, 0.0}; + char trans = 'N'; + float A[17 * 17 * 4]; + float X[17 * 2]; + float Y[17 * 2]; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + for (i = 0; i < (2 * N - 2); i += 4) + { + Y[i] = NAN; + Y[i + 1] = NAN; + + Y[i + 2] = INFINITY; + Y[i + 3] = INFINITY; + } + Y[2 * N - 1] = NAN; + Y[2 * N - 2] = NAN; + BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 2 * N; i ++) + ASSERT_TRUE(Y[i] == 0.0); +} + +CTEST(cgemv, 0_nan_inf_incy_2) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 2; + float alpha[2] = {0.0, 0.0}; + float beta[2] = {0.0, 0.0}; + char trans = 'N'; + float A[17 * 17 * 4]; + float X[17]; + float Y[17 * 
2 * 2]; + float *ay = Y; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + memset(Y, 0, sizeof(Y)); + for (i = 0; i < (2 * N - 2); i += 4) + { + ay[0] = NAN; + ay[1] = NAN; + ay += 4; + ay[0] = INFINITY; + ay[1] = INFINITY; + ay += 4; + } + Y[4 * N - 4] = NAN; + Y[4 * N - 3] = NAN; + BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 4 * N; i ++) + ASSERT_TRUE(Y[i] == 0.0); +} + +CTEST(cgemv, 0_2_nan_1_inf_1) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 1; + float alpha[2] = {0.0, 0.0}; + float beta[2] = {0.0, 2.0}; + char trans = 'N'; + float A[17 * 17 * 4]; + float X[17 * 2]; + float Y[17 * 2]; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + for (i = 0; i < (2 * N - 2); i += 4) + { + Y[i] = NAN; + Y[i + 1] = 1.0; + + Y[i + 2] = INFINITY; + Y[i + 3] = 1.0; + } + Y[2 * N - 2] = NAN; + Y[2 * N - 1] = 1.0; + BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 2 * N; i += 2) { + if ((i >> 1) % 2){ + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isinf(Y[i + 1])); + } + else { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } +} + +CTEST(cgemv, 0_2_nan_1_inf_1_incy_2) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 2; + float alpha[2] = {0.0, 0.0}; + float beta[2] = {0.0, 2.0}; + char trans = 'N'; + float A[17 * 17 * 4]; + float X[17]; + float Y[17 * 2 * 2]; + float *ay = Y; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + memset(Y, 0, sizeof(Y)); + for (i = 0; i < (2 * N - 2); i += 4) + { + ay[0] = NAN; + ay[1] = 1.0; + ay += 4; + ay[0] = INFINITY; + ay[1] = 1.0; + ay += 4; + } + Y[4 * N - 4] = NAN; + Y[4 * N - 3] = 1.0; + BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 4 * N; i += 2) { + if ((i >> 1) % 2) { + ASSERT_TRUE(Y[i] == 0.0); + ASSERT_TRUE(Y[i + 1] == 0.0); + } + else { + if ((i >> 2) % 2) { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isinf(Y[i + 1])); + } + else 
{ + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } + } +} + +CTEST(cgemv, 2_0_nan_1_inf_1) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 1; + float alpha[2] = {0.0, 0.0}; + float beta[2] = {2.0, 0.0}; + char trans = 'N'; + float A[17 * 17 * 4]; + float X[17 * 2]; + float Y[17 * 2]; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + for (i = 0; i < (2 * N - 2); i += 4) + { + Y[i] = NAN; + Y[i + 1] = 1.0; + + Y[i + 2] = INFINITY; + Y[i + 3] = 1.0; + } + Y[2 * N - 2] = NAN; + Y[2 * N - 1] = 1.0; + BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 2 * N; i += 2) { + if ((i >> 1) % 2){ + ASSERT_TRUE(isinf(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + else { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } +} + +CTEST(cgemv, 2_0_nan_1_inf_1_incy_2) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 2; + float alpha[2] = {0.0, 0.0}; + float beta[2] = {2.0, 0.0}; + char trans = 'N'; + float A[17 * 17 * 4]; + float X[17]; + float Y[17 * 2 * 2]; + float *ay = Y; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + memset(Y, 0, sizeof(Y)); + for (i = 0; i < (2 * N - 2); i += 4) + { + ay[0] = NAN; + ay[1] = 1.0; + ay += 4; + ay[0] = INFINITY; + ay[1] = 1.0; + ay += 4; + } + Y[4 * N - 4] = NAN; + Y[4 * N - 3] = 1.0; + BLASFUNC(cgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 4 * N; i += 2) { + if ((i >> 1) % 2) { + ASSERT_TRUE(Y[i] == 0.0); + ASSERT_TRUE(Y[i + 1] == 0.0); + } + else { + if ((i >> 2) % 2) { + ASSERT_TRUE(isinf(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + else { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } + } +} + +#endif + +#ifdef BUILD_COMPLEX16 + +CTEST(zgemv, 0_nan_inf) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 1; + double alpha[2] = {0.0, 0.0}; + double beta[2] = {0.0, 0.0}; + char trans = 'N'; + double A[17 * 17 * 4]; + double X[17 * 2]; + double 
Y[17 * 2]; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + for (i = 0; i < (2 * N - 2); i += 4) + { + Y[i] = NAN; + Y[i + 1] = NAN; + + Y[i + 2] = INFINITY; + Y[i + 3] = INFINITY; + } + Y[2 * N - 1] = NAN; + Y[2 * N - 2] = NAN; + BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 2 * N; i ++) + ASSERT_TRUE(Y[i] == 0.0); +} + +CTEST(zgemv, 0_nan_inf_incy_2) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 2; + double alpha[2] = {0.0, 0.0}; + double beta[2] = {0.0, 0.0}; + char trans = 'N'; + double A[17 * 17 * 4]; + double X[17]; + double Y[17 * 2 * 2]; + double *ay = Y; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + memset(Y, 0, sizeof(Y)); + for (i = 0; i < (2 * N - 2); i += 4) + { + ay[0] = NAN; + ay[1] = NAN; + ay += 4; + ay[0] = INFINITY; + ay[1] = INFINITY; + ay += 4; + } + Y[4 * N - 4] = NAN; + Y[4 * N - 3] = NAN; + BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 4 * N; i ++) + ASSERT_TRUE(Y[i] == 0.0); +} + +CTEST(zgemv, 0_2_nan_1_inf_1) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 1; + double alpha[2] = {0.0, 0.0}; + double beta[2] = {0.0, 2.0}; + char trans = 'N'; + double A[17 * 17 * 4]; + double X[17 * 2]; + double Y[17 * 2]; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + for (i = 0; i < (2 * N - 2); i += 4) + { + Y[i] = NAN; + Y[i + 1] = 1.0; + + Y[i + 2] = INFINITY; + Y[i + 3] = 1.0; + } + Y[2 * N - 2] = NAN; + Y[2 * N - 1] = 1.0; + BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 2 * N; i += 2) { + if ((i >> 1) % 2){ + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isinf(Y[i + 1])); + } + else { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } +} + +CTEST(zgemv, 0_2_nan_1_inf_1_incy_2) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 2; + double alpha[2] = {0.0, 0.0}; + double beta[2] = {0.0, 2.0}; + char trans = 'N'; + double 
A[17 * 17 * 4]; + double X[17]; + double Y[17 * 2 * 2]; + double *ay = Y; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + memset(Y, 0, sizeof(Y)); + for (i = 0; i < (2 * N - 2); i += 4) + { + ay[0] = NAN; + ay[1] = 1.0; + ay += 4; + ay[0] = INFINITY; + ay[1] = 1.0; + ay += 4; + } + Y[4 * N - 4] = NAN; + Y[4 * N - 3] = 1.0; + BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 4 * N; i += 2) { + if ((i >> 1) % 2) { + ASSERT_TRUE(Y[i] == 0.0); + ASSERT_TRUE(Y[i + 1] == 0.0); + } + else { + if ((i >> 2) % 2) { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isinf(Y[i + 1])); + } + else { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } + } +} + +CTEST(zgemv, 2_0_nan_1_inf_1) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 1; + double alpha[2] = {0.0, 0.0}; + double beta[2] = {2.0, 0.0}; + char trans = 'N'; + double A[17 * 17 * 4]; + double X[17 * 2]; + double Y[17 * 2]; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + for (i = 0; i < (2 * N - 2); i += 4) + { + Y[i] = NAN; + Y[i + 1] = 1.0; + + Y[i + 2] = INFINITY; + Y[i + 3] = 1.0; + } + Y[2 * N - 2] = NAN; + Y[2 * N - 1] = 1.0; + BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 2 * N; i += 2) { + if ((i >> 1) % 2){ + ASSERT_TRUE(isinf(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + else { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } +} + +CTEST(zgemv, 2_0_nan_1_inf_1_incy_2) +{ + int i; + blasint N = 17; + blasint incX = 1; + blasint incY = 2; + double alpha[2] = {0.0, 0.0}; + double beta[2] = {2.0, 0.0}; + char trans = 'N'; + double A[17 * 17 * 4]; + double X[17]; + double Y[17 * 2 * 2]; + double *ay = Y; + + memset(A, 0, sizeof(A)); + memset(X, 0, sizeof(X)); + memset(Y, 0, sizeof(Y)); + for (i = 0; i < (2 * N - 2); i += 4) + { + ay[0] = NAN; + ay[1] = 1.0; + ay += 4; + ay[0] = INFINITY; + ay[1] = 1.0; + ay += 4; + } + Y[4 * N - 4] = NAN; + Y[4 * N - 3] = 1.0; + 
BLASFUNC(zgemv)(&trans, &N, &N, alpha, A, &N, X, &incX, beta, Y, &incY); + for (i = 0; i < 4 * N; i += 2) { + if ((i >> 1) % 2) { + ASSERT_TRUE(Y[i] == 0.0); + ASSERT_TRUE(Y[i + 1] == 0.0); + } + else { + if ((i >> 2) % 2) { + ASSERT_TRUE(isinf(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + else { + ASSERT_TRUE(isnan(Y[i])); + ASSERT_TRUE(isnan(Y[i + 1])); + } + } + } +} + +#endif diff --git a/utest/test_zscal.c b/utest/test_zscal.c index 09e63752c..57d78b690 100644 --- a/utest/test_zscal.c +++ b/utest/test_zscal.c @@ -442,6 +442,33 @@ CTEST(cscal, i_0inf_inc_2) ASSERT_TRUE(isnan(inf[17])); } +CTEST(cscal, i00_NAN) +{ + blasint N=9; + blasint incX=1; + float i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 }; + float nan[] = {NAN, 0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + BLASFUNC(cscal)(&N, i, nan, &incX); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + +CTEST(cscal, i00_NAN_incx_2) +{ + blasint N=9; + blasint incX=2; + float i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 }; + float nan[] = {0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, + 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN}; + BLASFUNC(cscal)(&N, i, nan, &incX); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + #endif #ifdef BUILD_COMPLEX16 @@ -588,4 +615,31 @@ CTEST(zscal, i_0inf_inc_2) ASSERT_TRUE(isnan(inf[17])); } +CTEST(zscal, i00_NAN) +{ + blasint N=9; + blasint incX=1; + double i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 0,0 }; + double nan[] = {NAN, 0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0}; + BLASFUNC(zscal)(&N, i, nan, &incX); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + +CTEST(zscal, i00_NAN_incx_2) +{ + blasint N=9; + blasint incX=2; + double i[] = {0,0, 0,0, 0,0, 0,0, 0,0, 0,0, 
0,0, 0,0, 0,0 }; + double nan[] = {0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, + 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN, 0,NAN}; + BLASFUNC(zscal)(&N, i, nan, &incX); + ASSERT_TRUE(isnan(nan[0])); + ASSERT_TRUE(isnan(nan[1])); + ASSERT_TRUE(isnan(nan[16])); + ASSERT_TRUE(isnan(nan[17])); +} + #endif From b2117bb2cadd0c1894104ce6a7c0980cd7c9ffb7 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 16 Jan 2025 19:44:11 +0800 Subject: [PATCH 03/37] LoongArch64: Fixed LSX version of cscal and zscal --- kernel/loongarch64/cscal_lsx.S | 218 +++++++++------------------------ 1 file changed, 58 insertions(+), 160 deletions(-) diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S index 241d3d16e..c235a206a 100644 --- a/kernel/loongarch64/cscal_lsx.S +++ b/kernel/loongarch64/cscal_lsx.S @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHAI $f1 #define X $r7 #define INCX $r8 +#define DUMMY2 $r9 #define I $r12 #define TEMP $r13 @@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, N, .L999 bge $r0, INCX, .L999 + ld.d DUMMY2, $sp, 0 li.d TEMP, 1 movgr2fr.d a1, $r0 FFINT a1, a1 @@ -84,24 +86,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. srai.d I, N, 2 bne INCX, TEMP, .L22 +/////// INCX == 1 //////// .L11: - bge $r0, I, .L997 CMPEQ $fcc0, ALPHAR, a1 CMPEQ $fcc1, ALPHAI, a1 - bceqz $fcc0, .L13 - b .L14 - .align 3 + bge $r0, I, .L19 -.L13: - bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 - b .L113 //alpha_r != 0.0 && alpha_i == 0.0 +/////// INCX == 1 && N >= 4 //////// + bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal. 
-.L14: - bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 - b .L111 //alpha_r == 0.0 && alpha_i == 0.0 - .align 3 + bceqz $fcc0, .L17 -.L111: //alpha_r == 0.0 && alpha_i == 0.0 + bceqz $fcc1, .L17 + +.L15: //alpha_r == 0.0 && alpha_i == 0.0 vst VXZ, X, 0 * SIZE #ifdef DOUBLE vst VXZ, X, 2 * SIZE @@ -112,50 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif addi.d X, X, 8 * SIZE addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L113: //alpha_r != 0.0 && alpha_i == 0.0 - vld VX0, X, 0 * SIZE -#ifdef DOUBLE - vld VX1, X, 2 * SIZE - vpickev.d x1, VX1, VX0 - vpickod.d x2, VX1, VX0 - vfmul.d x3, VXAR, x1 - vfmul.d x4, VXAR, x2 - vilvl.d VX2, x4 ,x3 - vilvh.d VX3, x4, x3 - vst VX2, X, 0 * SIZE - vst VX3, X, 2 * SIZE - vld VX0, X, 4 * SIZE - vld VX1, X, 6 * SIZE - vpickev.d x1, VX1, VX0 - vpickod.d x2, VX1, VX0 - vfmul.d x3, VXAR, x1 - vfmul.d x4, VXAR, x2 - vilvl.d VX2, x4 ,x3 - vilvh.d VX3, x4, x3 - vst VX2, X, 4 * SIZE - vst VX3, X, 6 * SIZE -#else - vld VX1, X, 4 * SIZE - vpickev.w x1, VX1, VX0 - vpickod.w x2, VX1, VX0 - vfmul.s x3, VXAR, x1 - vfmul.s x4, VXAR, x2 - vilvl.w VX2, x4 ,x3 - vilvh.w VX3, x4, x3 - vst VX2, X, 0 * SIZE - vst VX3, X, 4 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 + blt $r0, I, .L15 + b .L19 .align 3 -.L114: //alpha_r != 0.0 && alpha_i != 0.0 +.L17: vld VX0, X, 0 * SIZE #ifdef DOUBLE vld VX1, X, 2 * SIZE @@ -196,29 +155,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif addi.d X, X, 8 * SIZE addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 + blt $r0, I, .L17 + b .L19 .align 3 +/////// INCX == 1 && N < 8 /////// +.L19: + andi I, N, 3 + beqz I, .L999 + bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal. 
+ + bceqz $fcc0, .L998 + + bceqz $fcc1, .L998 + + b .L995 // alpha_r == 0.0 && alpha_i == 0.0 + +/////// INCX != 1 //////// .L22: - bge $r0, I, .L997 - move XX, X CMPEQ $fcc0, ALPHAR, a1 CMPEQ $fcc1, ALPHAI, a1 - bceqz $fcc0, .L23 - b .L24 - .align 3 + move XX, X + bge $r0, I, .L29 + bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal. -.L23: - bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 - b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + bceqz $fcc0, .L25 -.L24: - bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 - b .L221 //alpha_r == 0.0 && alpha_i == 0.0 - .align 3 + bceqz $fcc1, .L25 -.L221: //alpha_r == 0.0 && alpha_i == 0.0 +.L27: //alpha_r == 0.0 && alpha_i == 0.0 #ifdef DOUBLE vstelm.d VXZ, X, 0, 0 vstelm.d VXZ, X, 1 * SIZE, 0 @@ -246,92 +211,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif add.d X, X, INCX addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 + blt $r0, I, .L27 + b .L29 .align 3 -.L223: //alpha_r != 0.0 && alpha_i == 0.0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - vfmul.d x3, VXAR, x1 - vfmul.d x4, VXAR, x2 - vstelm.d x3, XX, 0 * SIZE, 0 - vstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d x3, XX, 0 * SIZE, 1 - vstelm.d x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - vinsgr2vr.d x1, t1, 0 - vinsgr2vr.d x2, t2, 0 - vinsgr2vr.d x1, t3, 1 - vinsgr2vr.d x2, t4, 1 - add.d X, X, INCX - vfmul.d x3, VXAR, x1 - vfmul.d x4, VXAR, x2 - addi.d I, I, -1 - vstelm.d x3, XX, 0 * SIZE, 0 - vstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.d x3, XX, 0 * SIZE, 1 - vstelm.d x4, XX, 1 * SIZE, 1 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w 
t4, X, 1 * SIZE - add.d X, X, INCX - vinsgr2vr.w x1, t1, 0 - vinsgr2vr.w x2, t2, 0 - vinsgr2vr.w x1, t3, 1 - vinsgr2vr.w x2, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - vinsgr2vr.w x1, t1, 2 - vinsgr2vr.w x2, t2, 2 - vinsgr2vr.w x1, t3, 3 - vinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - - vfmul.s x3, VXAR, x1 - vfmul.s x4, VXAR, x2 - addi.d I, I, -1 - vstelm.w x3, XX, 0 * SIZE, 0 - vstelm.w x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 1 - vstelm.w x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 2 - vstelm.w x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - vstelm.w x3, XX, 0 * SIZE, 3 - vstelm.w x4, XX, 1 * SIZE, 3 -#endif - add.d XX, XX, INCX - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: //alpha_r != 0.0 && alpha_i != 0.0 +.L25: #ifdef DOUBLE ld.d t1, X, 0 * SIZE ld.d t2, X, 1 * SIZE @@ -414,15 +298,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vstelm.w x4, XX, 1 * SIZE, 3 #endif add.d XX, XX, INCX - blt $r0, I, .L224 - b .L997 + blt $r0, I, .L25 + b .L29 .align 3 -.L997: - andi I, N, 3 - bge $r0, I, .L999 - .align 3 +/////// INCX != 1 && N < 8 /////// +.L29: + andi I, N, 3 + beqz I, .L999 + bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal. + + bceqz $fcc0, .L998 + + bceqz $fcc1, .L998 + b .L995 // alpha_r == 0.0 && alpha_i == 0.0 + +.L995: // alpha_r == 0.0 && alpha_i == 0.0 + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L995 + b .L999 .L998: LD a1, X, 0 * SIZE LD a2, X, 1 * SIZE @@ -435,7 +333,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ST s2, X, 1 * SIZE add.d X, X, INCX blt $r0, I, .L998 - .align 3 + b .L999 .L999: move $r4, $r12 From 5392f6df6908dfb64fd972d4c467058226547294 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 16 Jan 2025 19:50:22 +0800 Subject: [PATCH 04/37] LoongArch64: Fixed LASX version of cscal and zscal --- kernel/loongarch64/cscal_lasx.S | 244 ++++++++------------------------ 1 file changed, 61 insertions(+), 183 deletions(-) diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S index f53526663..daeb180e9 100644 --- a/kernel/loongarch64/cscal_lasx.S +++ b/kernel/loongarch64/cscal_lasx.S @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHAI $f1 #define X $r7 #define INCX $r8 +#define DUMMY2 $r9 #define I $r12 #define TEMP $r13 @@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, N, .L999 bge $r0, INCX, .L999 + ld.d DUMMY2, $sp, 0 li.d TEMP, 1 movgr2fr.d a1, $r0 FFINT a1, a1 @@ -86,24 +88,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif bne INCX, TEMP, .L22 +/////// INCX == 1 //////// .L11: - bge $r0, I, .L997 CMPEQ $fcc0, ALPHAR, a1 CMPEQ $fcc1, ALPHAI, a1 - bceqz $fcc0, .L13 - b .L14 - .align 3 + bge $r0, I, .L19 +/////// INCX == 1 && N >= 4 //////// + bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal. -.L13: - bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 - b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + bceqz $fcc0, .L17 -.L14: - bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 - b .L111 //alpha_r == 0.0 && alpha_i == 0.0 - .align 3 + bceqz $fcc1, .L17 -.L111: //alpha_r == 0.0 && alpha_i == 0.0 +.L15: //alpha_r == 0.0 && alpha_i == 0.0 xvst VXZ, X, 0 * SIZE #ifdef DOUBLE xvst VXZ, X, 4 * SIZE @@ -113,41 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi.d X, X, 16 * SIZE #endif addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 + blt $r0, I, .L15 + b .L19 .align 3 -.L113: //alpha_r != 0.0 && alpha_i == 0.0 - xvld VX0, X, 0 * SIZE -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvpickev.d x1, VX1, VX0 - xvpickod.d x2, VX1, VX0 - xvfmul.d x3, VXAR, x1 - xvfmul.d x4, VXAR, x2 - xvilvl.d VX2, x4 ,x3 - xvilvh.d VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE -#else - xvld VX1, X, 8 * SIZE - xvpickev.w x1, VX1, VX0 - xvpickod.w x2, VX1, VX0 - xvfmul.s x3, VXAR, x1 - xvfmul.s x4, VXAR, x2 - xvilvl.w VX2, x4 ,x3 - xvilvh.w VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 8 * SIZE - addi.d X, X, 16 * SIZE -#endif - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: //alpha_r != 0.0 && alpha_i != 0.0 +.L17: xvld VX0, X, 0 * SIZE #ifdef DOUBLE xvld VX1, X, 4 * SIZE @@ -177,29 +144,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d X, X, 16 * SIZE #endif addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 + blt $r0, I, .L17 + b .L19 + .align 3 + +/////// INCX == 1 && N < 8 /////// +.L19: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + beqz I, .L999 + bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal. + + bceqz $fcc0, .L998 + + bceqz $fcc1, .L998 + + b .L995 // alpha_r == 0.0 && alpha_i == 0.0 .align 3 +/////// INCX != 1 //////// .L22: - bge $r0, I, .L997 - move XX, X CMPEQ $fcc0, ALPHAR, a1 CMPEQ $fcc1, ALPHAI, a1 - bceqz $fcc0, .L23 - b .L24 - .align 3 - -.L23: - bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 - b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + move XX, X + bge $r0, I, .L29 + bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal. 
+ bceqz $fcc0, .L25 -.L24: - bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 - b .L221 //alpha_r == 0.0 && alpha_i == 0.0 - .align 3 + bceqz $fcc1, .L25 -.L221: //alpha_r == 0.0 && alpha_i == 0.0 +.L27: //alpha_r == 0.0 && alpha_i == 0.0 #ifdef DOUBLE xvstelm.d VXZ, X, 0, 0 xvstelm.d VXZ, X, 1 * SIZE, 0 @@ -239,122 +216,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif add.d X, X, INCX addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 + blt $r0, I, .L27 + b .L29 .align 3 -.L223: //alpha_r != 0.0 && alpha_i == 0.0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.d x1, t1, 0 - xvinsgr2vr.d x2, t2, 0 - xvinsgr2vr.d x1, t3, 1 - xvinsgr2vr.d x2, t4, 1 - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - xvinsgr2vr.d x1, t1, 2 - xvinsgr2vr.d x2, t2, 2 - xvinsgr2vr.d x1, t3, 3 - xvinsgr2vr.d x2, t4, 3 - add.d X, X, INCX - - xvfmul.d x3, VXAR, x1 - xvfmul.d x4, VXAR, x2 - addi.d I, I, -1 - xvstelm.d x3, XX, 0 * SIZE, 0 - xvstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 1 - xvstelm.d x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 2 - xvstelm.d x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 3 - xvstelm.d x4, XX, 1 * SIZE, 3 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 0 - xvinsgr2vr.w x2, t2, 0 - xvinsgr2vr.w x1, t3, 1 - xvinsgr2vr.w x2, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 2 - xvinsgr2vr.w x2, t2, 2 - xvinsgr2vr.w x1, t3, 3 - xvinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - 
add.d X, X, INCX - xvinsgr2vr.w x1, t1, 4 - xvinsgr2vr.w x2, t2, 4 - xvinsgr2vr.w x1, t3, 5 - xvinsgr2vr.w x2, t4, 5 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 6 - xvinsgr2vr.w x2, t2, 6 - xvinsgr2vr.w x1, t3, 7 - xvinsgr2vr.w x2, t4, 7 - add.d X, X, INCX - - xvfmul.s x3, VXAR, x1 - xvfmul.s x4, VXAR, x2 - addi.d I, I, -1 - xvstelm.w x3, XX, 0 * SIZE, 0 - xvstelm.w x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 1 - xvstelm.w x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 2 - xvstelm.w x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 3 - xvstelm.w x4, XX, 1 * SIZE, 3 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 4 - xvstelm.w x4, XX, 1 * SIZE, 4 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 5 - xvstelm.w x4, XX, 1 * SIZE, 5 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 6 - xvstelm.w x4, XX, 1 * SIZE, 6 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 7 - xvstelm.w x4, XX, 1 * SIZE, 7 -#endif - add.d XX, XX, INCX - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: //alpha_r != 0.0 && alpha_i != 0.0 +.L25: #ifdef DOUBLE ld.d t1, X, 0 * SIZE ld.d t2, X, 1 * SIZE @@ -465,19 +331,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvstelm.w x4, XX, 1 * SIZE, 7 #endif add.d XX, XX, INCX - blt $r0, I, .L224 - b .L997 + blt $r0, I, .L25 + b .L29 .align 3 -.L997: +/////// INCX != 1 && N < 8 /////// +.L29: #ifdef DOUBLE - andi I, N, 3 + andi I, N, 3 #else - andi I, N, 7 + andi I, N, 7 #endif - bge $r0, I, .L999 - .align 3 + beqz I, .L999 + bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal. 
+ bceqz $fcc0, .L998 + + bceqz $fcc1, .L998 + +.L995: // alpha_r == 0.0 && alpha_i == 0.0 + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L995 + b .L999 .L998: LD a1, X, 0 * SIZE LD a2, X, 1 * SIZE @@ -490,7 +368,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ST s2, X, 1 * SIZE add.d X, X, INCX blt $r0, I, .L998 - .align 3 + b .L999 .L999: move $r4, $r12 From 2da86b80c939187936dd155def9380332cb3a67b Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 22 Jan 2025 14:32:20 +0800 Subject: [PATCH 05/37] LoongArch64: Fixed scalar version of cscal and zscal --- kernel/loongarch64/zscal.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/loongarch64/zscal.S b/kernel/loongarch64/zscal.S index a12e527a5..f6213b159 100644 --- a/kernel/loongarch64/zscal.S +++ b/kernel/loongarch64/zscal.S @@ -53,6 +53,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE li.d TEMP, 2 * SIZE + ld.d XX, $sp, 0 // Load dummy2 + slli.d XX, XX, ZBASE_SHIFT MTC a1, $r0 slli.d INCX, INCX, ZBASE_SHIFT bge $r0, N, .L999 @@ -60,6 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
CMPEQ $fcc1, ALPHA_I, a1 bceqz $fcc0, .L50 bceqz $fcc1, .L50 + beq XX, TEMP, .L50 // if dummy2 == 1, do not directly copy 0 srai.d I, N, 2 bne INCX, TEMP, .L20 bge $r0, I, .L15 From 73214446602758b9aaf73f48de8d3b81990b9343 Mon Sep 17 00:00:00 2001 From: Ye Tao Date: Mon, 12 May 2025 13:41:21 +0000 Subject: [PATCH 06/37] enable sbgemm to be forward to sbgemv on arm64 --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index ac6a41c92..38646c3c6 100644 --- a/Makefile.system +++ b/Makefile.system @@ -276,6 +276,7 @@ SMALL_MATRIX_OPT = 1 endif ifeq ($(ARCH), arm64) GEMM_GEMV_FORWARD = 1 +GEMM_GEMV_FORWARD_BF16 = 1 endif ifeq ($(ARCH), riscv) GEMM_GEMV_FORWARD = 1 From 0ccb05058312caed86befc75923b6f888ae4e7a6 Mon Sep 17 00:00:00 2001 From: pengxu Date: Tue, 13 May 2025 16:08:33 +0800 Subject: [PATCH 07/37] Loongarch64: fixed cgemm_ncopy_16_lasx --- kernel/loongarch64/cgemm_ncopy_16_lasx.S | 774 +++++++---------------- 1 file changed, 212 insertions(+), 562 deletions(-) diff --git a/kernel/loongarch64/cgemm_ncopy_16_lasx.S b/kernel/loongarch64/cgemm_ncopy_16_lasx.S index 7c2d0ac64..4b9225314 100644 --- a/kernel/loongarch64/cgemm_ncopy_16_lasx.S +++ b/kernel/loongarch64/cgemm_ncopy_16_lasx.S @@ -45,18 +45,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define S6 $r17 #define S7 $r18 #define S8 $r19 -#define S9 $r20 -#define S10 $r23 -#define S11 $r24 -#define S12 $r25 -#define S13 $r26 -#define S14 $r27 -#define S15 $r28 -#define S16 $r29 -#define TD $r30 -#define TS $r31 +#define S9 $r23 +#define S10 $r24 +#define S11 $r25 +#define S12 $r26 +#define S13 $r27 +#define S14 $r28 +#define S15 $r29 +#define S16 $r30 +#define TD $r20 +#define TS $r11 #define TL $r7 -#define T0 $r6 #define ZERO $r0 #define F0 $f0 @@ -67,6 +66,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define F5 $f5 #define F6 $f6 #define F7 $f7 +#define F8 $f8 +#define F9 $f9 +#define F10 $f10 +#define F11 $f11 +#define F12 $f12 +#define F13 $f13 +#define F14 $f14 +#define F15 $f15 /* LASX vectors */ #define U0 $xr0 #define U1 $xr1 @@ -103,589 +110,232 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE - addi.d $sp, $sp, -0x90 - SDARG $r23, $sp, 0x00 - SDARG $r24, $sp, 0x08 - SDARG $r25, $sp, 0x10 - SDARG $r26, $sp, 0x18 - SDARG $r27, $sp, 0x20 - SDARG $r28, $sp, 0x28 - SDARG $r29, $sp, 0x30 - SDARG $r30, $sp, 0x38 - SDARG $r31, $sp, 0x40 - ST $f23, $sp, 0x48 - ST $f24, $sp, 0x50 - ST $f25, $sp, 0x58 - ST $f26, $sp, 0x60 - ST $f27, $sp, 0x68 - ST $f28, $sp, 0x70 - ST $f29, $sp, 0x78 - ST $f30, $sp, 0x80 - ST $f31, $sp, 0x88 - - move TD, DST - move TS, SRC - slli.d TL, LDA, 0x03 - slli.d T0, TL, 0x01 - srai.d J, N, 0x04 + addi.d $sp, $sp, -64 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + SDARG $r30, $sp, 56 + + move TD, DST //boffset + move TS, SRC //aoffset + slli.d TL, LDA, 0x03 //lda + srai.d J, N, 0x04 //j beq J, ZERO, .L_N8 -.L_J1: /* J-- */ +.L_J1: /* if(j>0) j--*/ move S1, TS add.d S2, TS, TL - srai.d I, M, 0x03 + move I, M add.d S3, S2, TL - addi.d J, J, -1 add.d S4, S3, TL - add.d S5, S3, T0 - add.d S6, S4, T0 - add.d S7, S5, T0 - add.d S8, S6, T0 - add.d S9, S7, T0 - add.d S10, S8, T0 - add.d S11, S9, T0 - add.d S12, S10, T0 - add.d S13, S11, T0 - add.d S14, S12, T0 - add.d S15, S13, T0 - add.d S16, S14, T0 - add.d TS, S15, T0 - beq I, ZERO, .L_I7 - -.L_I1: /* I-- */ - xvld U0, S1, 0x00 - xvld U1, S2, 0x00 - xvld U2, S3, 0x00 - xvld U3, S4, 0x00 - xvld U4, S5, 0x00 - xvld U5, S6, 0x00 - xvld U6, S7, 0x00 - xvld U7, S8, 0x00 - xvld U8, S9, 0x00 - xvld U9, S10, 0x00 - xvld U10, S11, 0x00 - xvld U11, S12, 0x00 - xvld U12, S13, 0x00 - xvld U13, S14, 0x00 - xvld U14, S15, 0x00 - xvld U15, S16, 0x00 - - xvpackev.d D0, 
U1, U0 - xvpackod.d D1, U1, U0 - xvpackev.d D2, U3, U2 - xvpackod.d D3, U3, U2 - xvpackev.d D4, U5, U4 - xvpackod.d D5, U5, U4 - xvpackev.d D6, U7, U6 - xvpackod.d D7, U7, U6 - - xvpackev.d D8, U9, U8 - xvpackod.d D9, U9, U8 - xvpackev.d D10, U11, U10 - xvpackod.d D11, U11, U10 - xvpackev.d D12, U13, U12 - xvpackod.d D13, U13, U12 - xvpackev.d D14, U15, U14 - xvpackod.d D15, U15, U14 - - xvand.v U0, D0, D0 - xvpermi.q D0, D2, 0x02 // 0 - xvand.v U4, D4, D4 - xvpermi.q D4, D6, 0x02 // 1 - xvand.v U1, D1, D1 - xvpermi.q D1, D3, 0x02 // 4 - xvand.v U5, D5, D5 - xvpermi.q D5, D7, 0x02 // 5 - xvpermi.q D2, U0, 0x31 // 8 - xvpermi.q D6, U4, 0x31 // 9 - xvpermi.q D3, U1, 0x31 // 12 - xvpermi.q D7, U5, 0x31 // 13 - - xvand.v U8, D8, D8 - xvpermi.q D8, D10, 0x02 // 2 - xvand.v U12, D12, D12 - xvpermi.q D12, D14, 0x02 // 3 - xvand.v U9, D9, D9 - xvpermi.q D9, D11, 0x02 // 6 - xvand.v U13, D13, D13 - xvpermi.q D13, D15, 0x02 // 7 - xvpermi.q D10, U8, 0x31 // 10 - xvpermi.q D14, U12, 0x31 // 11 - xvpermi.q D11, U9, 0x31 // 14 - xvpermi.q D15, U13, 0x31 // 15 - - xvst D0, TD, 0x00 // 0 - xvst D4, TD, 0x20 // 1 - xvst D8, TD, 0x40 // 2 - xvst D12, TD, 0x60 // 3 - xvst D1, TD, 0x80 // 4 - xvst D5, TD, 0xA0 // 5 - xvst D9, TD, 0xC0 // 6 - xvst D13, TD, 0xE0 // 7 - addi.d TD, TD, 0x100 - xvst D2, TD, 0x00 // 8 - xvst D6, TD, 0x20 // 9 - xvst D10, TD, 0x40 // 10 - xvst D14, TD, 0x60 // 11 - xvst D3, TD, 0x80 // 12 - xvst D7, TD, 0xA0 // 13 - xvst D11, TD, 0xC0 // 14 - xvst D15, TD, 0xE0 // 15 - addi.d TD, TD, 0x100 - - xvld U0, S1, 0x20 - xvld U1, S2, 0x20 - xvld U2, S3, 0x20 - xvld U3, S4, 0x20 - xvld U4, S5, 0x20 - xvld U5, S6, 0x20 - xvld U6, S7, 0x20 - xvld U7, S8, 0x20 - xvld U8, S9, 0x20 - xvld U9, S10, 0x20 - xvld U10, S11, 0x20 - xvld U11, S12, 0x20 - xvld U12, S13, 0x20 - xvld U13, S14, 0x20 - xvld U14, S15, 0x20 - xvld U15, S16, 0x20 - - xvpackev.d D0, U1, U0 - xvpackod.d D1, U1, U0 - xvpackev.d D2, U3, U2 - xvpackod.d D3, U3, U2 - xvpackev.d D4, U5, U4 - xvpackod.d D5, 
U5, U4 - xvpackev.d D6, U7, U6 - xvpackod.d D7, U7, U6 - - xvpackev.d D8, U9, U8 - xvpackod.d D9, U9, U8 - xvpackev.d D10, U11, U10 - xvpackod.d D11, U11, U10 - xvpackev.d D12, U13, U12 - xvpackod.d D13, U13, U12 - xvpackev.d D14, U15, U14 - xvpackod.d D15, U15, U14 - - xvand.v U0, D0, D0 - xvpermi.q D0, D2, 0x02 // 0 - xvand.v U4, D4, D4 - xvpermi.q D4, D6, 0x02 // 1 - xvand.v U1, D1, D1 - xvpermi.q D1, D3, 0x02 // 4 - xvand.v U5, D5, D5 - xvpermi.q D5, D7, 0x02 // 5 - xvpermi.q D2, U0, 0x31 // 8 - xvpermi.q D6, U4, 0x31 // 9 - xvpermi.q D3, U1, 0x31 // 12 - xvpermi.q D7, U5, 0x31 // 13 - - xvand.v U8, D8, D8 - xvpermi.q D8, D10, 0x02 // 2 - xvand.v U12, D12, D12 - xvpermi.q D12, D14, 0x02 // 3 - xvand.v U9, D9, D9 - xvpermi.q D9, D11, 0x02 // 6 - xvand.v U13, D13, D13 - xvpermi.q D13, D15, 0x02 // 7 - xvpermi.q D10, U8, 0x31 // 10 - xvpermi.q D14, U12, 0x31 // 11 - xvpermi.q D11, U9, 0x31 // 14 - xvpermi.q D15, U13, 0x31 // 15 - - xvst D0, TD, 0x00 // 0 - xvst D4, TD, 0x20 // 1 - xvst D8, TD, 0x40 // 2 - xvst D12, TD, 0x60 // 3 - xvst D1, TD, 0x80 // 4 - xvst D5, TD, 0xA0 // 5 - xvst D9, TD, 0xC0 // 6 - xvst D13, TD, 0xE0 // 7 - addi.d TD, TD, 0x100 - xvst D2, TD, 0x00 // 8 - xvst D6, TD, 0x20 // 9 - xvst D10, TD, 0x40 // 10 - xvst D14, TD, 0x60 // 11 - xvst D3, TD, 0x80 // 12 - xvst D7, TD, 0xA0 // 13 - xvst D11, TD, 0xC0 // 14 - xvst D15, TD, 0xE0 // 15 - addi.d TD, TD, 0x100 - - - addi.d S1, S1, 0x40 - addi.d S2, S2, 0x40 - addi.d S3, S3, 0x40 - addi.d S4, S4, 0x40 - addi.d S5, S5, 0x40 - addi.d S6, S6, 0x40 - addi.d S7, S7, 0x40 - addi.d S8, S8, 0x40 - addi.d S9, S9, 0x40 - addi.d S10, S10, 0x40 - addi.d S11, S11, 0x40 - addi.d S12, S12, 0x40 - addi.d S13, S13, 0x40 - addi.d S14, S14, 0x40 - addi.d S15, S15, 0x40 - addi.d S16, S16, 0x40 - + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d S9, S8, TL + add.d S10, S9, TL + add.d S11, S10, TL + add.d S12, S11, TL + add.d S13, S12, TL + add.d S14, S13, TL + add.d S15, S14, TL + 
add.d S16, S15, TL + add.d TS, S16, TL + beq I, ZERO, .L_J11 + +.L_I1: /* if(i>0) i--*/ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + fld.d F0, S9, 0x00 + fld.d F1, S10, 0x00 + fld.d F2, S11, 0x00 + fld.d F3, S12, 0x00 + fld.d F4, S13, 0x00 + fld.d F5, S14, 0x00 + fld.d F6, S15, 0x00 + fld.d F7, S16, 0x00 + + fst.d F0, TD, 0x40 + fst.d F1, TD, 0x48 + fst.d F2, TD, 0x50 + fst.d F3, TD, 0x58 + fst.d F4, TD, 0x60 + fst.d F5, TD, 0x68 + fst.d F6, TD, 0x70 + fst.d F7, TD, 0x78 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d S9, S9, 0x08 + addi.d S10, S10, 0x08 + addi.d S11, S11, 0x08 + addi.d S12, S12, 0x08 + addi.d S13, S13, 0x08 + addi.d S14, S14, 0x08 + addi.d S15, S15, 0x08 + addi.d S16, S16, 0x08 + addi.d TD, TD, 0x80 addi.d I, I, -1 blt ZERO, I, .L_I1 -.L_I7: - andi I, M, 0x07 - beq I, ZERO, .L_I0 - -.L_II1: /* I-- */ - fld.d F0, S1, 0x00 - fld.d F1, S2, 0x00 - fld.d F2, S3, 0x00 - fld.d F3, S4, 0x00 - fld.d F4, S5, 0x00 - fld.d F5, S6, 0x00 - fld.d F6, S7, 0x00 - fld.d F7, S8, 0x00 - - fst.d F0, TD, 0x00 - addi.d S1, S1, 0x08 - fst.d F1, TD, 0x08 - addi.d S2, S2, 0x08 - fst.d F2, TD, 0x10 - addi.d S3, S3, 0x08 - fst.d F3, TD, 0x18 - addi.d S4, S4, 0x08 - fst.d F4, TD, 0x20 - addi.d S5, S5, 0x08 - fst.d F5, TD, 0x28 - addi.d S6, S6, 0x08 - fst.d F6, TD, 0x30 - addi.d S7, S7, 0x08 - fst.d F7, TD, 0x38 - addi.d S8, S8, 0x08 - addi.d TD, TD, 0x40 - - fld.d F0, S9, 0x00 - fld.d F1, S10, 0x00 - fld.d F2, S11, 0x00 - fld.d F3, S12, 0x00 - fld.d F4, S13, 0x00 - fld.d F5, S14, 0x00 - fld.d F6, S15, 0x00 - fld.d F7, S16, 0x00 - - fst.d F0, TD, 
0x00 - addi.d S9, S9, 0x08 - fst.d F1, TD, 0x08 - addi.d S10, S10, 0x08 - fst.d F2, TD, 0x10 - addi.d S11, S11, 0x08 - fst.d F3, TD, 0x18 - addi.d S12, S12, 0x08 - fst.d F4, TD, 0x20 - addi.d S13, S13, 0x08 - fst.d F5, TD, 0x28 - addi.d S14, S14, 0x08 - fst.d F6, TD, 0x30 - addi.d S15, S15, 0x08 - fst.d F7, TD, 0x38 - addi.d S16, S16, 0x08 - addi.d TD, TD, 0x40 - - addi.d I, I, -1 - blt ZERO, I, .L_II1 - -.L_I0: - blt ZERO, J, .L_J1 - -.L_N8: - andi J, N, 0x08 - beq ZERO, J, .L_N4 +.L_J11: /* j--*/ + addi.d J, J, -1 + blt ZERO, J, .L_J1 + +.L_N8: /* if(n&8)*/ + andi I, N, 0x08 + beq I, ZERO, .L_N4 move S1, TS add.d S2, TS, TL - srai.d I, M, 0x03 + move I, M add.d S3, S2, TL - add.d S4, S2, T0 - add.d S5, S3, T0 - add.d S6, S4, T0 - add.d S7, S5, T0 - add.d S8, S6, T0 - add.d TS, S7, T0 - beq I, ZERO, .L_8I3 - -.L_8I1: /* I-- */ - xvld U0, S1, 0x00 - xvld U1, S2, 0x00 - xvld U2, S3, 0x00 - xvld U3, S4, 0x00 - xvld U4, S5, 0x00 - xvld U5, S6, 0x00 - xvld U6, S7, 0x00 - xvld U7, S8, 0x00 - - xvpackev.d D0, U1, U0 - xvpackod.d D1, U1, U0 - xvpackev.d D2, U3, U2 - xvpackod.d D3, U3, U2 - xvpackev.d D4, U5, U4 - xvpackod.d D5, U5, U4 - xvpackev.d D6, U7, U6 - xvpackod.d D7, U7, U6 - - xvand.v U0, D0, D0 - xvpermi.q D0, D2, 0x02 // 0 - xvand.v U4, D4, D4 - xvpermi.q D4, D6, 0x02 // 1 - xvand.v U1, D1, D1 - xvpermi.q D1, D3, 0x02 // 2 - xvand.v U5, D5, D5 - xvpermi.q D5, D7, 0x02 // 3 - xvpermi.q D2, U0, 0x31 // 4 - xvpermi.q D6, U4, 0x31 // 5 - xvpermi.q D3, U1, 0x31 // 6 - xvpermi.q D7, U5, 0x31 // 7 - - xvst D0, TD, 0x00 - xvst D4, TD, 0x20 - xvst D1, TD, 0x40 - xvst D5, TD, 0x60 - xvst D2, TD, 0x80 - xvst D6, TD, 0xA0 - xvst D3, TD, 0xC0 - xvst D7, TD, 0xE0 - addi.d TD, TD, 0x100 - - xvld U0, S1, 0x20 - xvld U1, S2, 0x20 - xvld U2, S3, 0x20 - xvld U3, S4, 0x20 - xvld U4, S5, 0x20 - xvld U5, S6, 0x20 - xvld U6, S7, 0x20 - xvld U7, S8, 0x20 - - xvpackev.d D0, U1, U0 - xvpackod.d D1, U1, U0 - xvpackev.d D2, U3, U2 - xvpackod.d D3, U3, U2 - xvpackev.d D4, U5, U4 - 
xvpackod.d D5, U5, U4 - xvpackev.d D6, U7, U6 - xvpackod.d D7, U7, U6 - - xvand.v U0, D0, D0 - xvpermi.q D0, D2, 0x02 // 0 - xvand.v U4, D4, D4 - xvpermi.q D4, D6, 0x02 // 1 - xvand.v U1, D1, D1 - xvpermi.q D1, D3, 0x02 // 2 - xvand.v U5, D5, D5 - xvpermi.q D5, D7, 0x02 // 3 - xvpermi.q D2, U0, 0x31 // 4 - xvpermi.q D6, U4, 0x31 // 5 - xvpermi.q D3, U1, 0x31 // 6 - xvpermi.q D7, U5, 0x31 // 7 - - xvst D0, TD, 0x00 - xvst D4, TD, 0x20 - xvst D1, TD, 0x40 - xvst D5, TD, 0x60 - xvst D2, TD, 0x80 - xvst D6, TD, 0xA0 - xvst D3, TD, 0xC0 - xvst D7, TD, 0xE0 - addi.d TD, TD, 0x100 - - addi.d S1, S1, 0x40 - addi.d S2, S2, 0x40 - addi.d S3, S3, 0x40 - addi.d S4, S4, 0x40 - addi.d S5, S5, 0x40 - addi.d S6, S6, 0x40 - addi.d S7, S7, 0x40 - addi.d S8, S8, 0x40 - + add.d S4, S3, TL + add.d S5, S4, TL + add.d S6, S5, TL + add.d S7, S6, TL + add.d S8, S7, TL + add.d TS, S8, TL + beq I, ZERO, .L_N4 + +.L_N81: /* if(i>0) i--*/ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + fst.d F4, TD, 0x20 + fst.d F5, TD, 0x28 + fst.d F6, TD, 0x30 + fst.d F7, TD, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 addi.d I, I, -1 - blt ZERO, I, .L_8I1 - -.L_8I3: - andi I, M, 0x07 - beq I, ZERO, .L_N4 - -.L_8I11: - fld.d F0, S1, 0x00 - fld.d F1, S2, 0x00 - fld.d F2, S3, 0x00 - fld.d F3, S4, 0x00 - fld.d F4, S5, 0x00 - fld.d F5, S6, 0x00 - fld.d F6, S7, 0x00 - fld.d F7, S8, 0x00 - - fst.d F0, TD, 0x00 - addi.d S1, S1, 0x08 - fst.d F1, TD, 0x08 - addi.d S2, S2, 0x08 - fst.d F2, TD, 0x10 - addi.d S3, S3, 0x08 - fst.d F3, TD, 0x18 - addi.d S4, S4, 0x08 - fst.d F4, TD, 0x20 - addi.d S5, S5, 0x08 - fst.d F5, TD, 0x28 - addi.d S6, S6, 0x08 - fst.d 
F6, TD, 0x30 - addi.d S7, S7, 0x08 - fst.d F7, TD, 0x38 - addi.d S8, S8, 0x08 - - addi.d TD, TD, 0x40 - addi.d I, I, -1 - blt ZERO, I, .L_8I11 - -.L_N4: - andi J, N, 0x04 - beq ZERO, J, .L_N2 + blt ZERO, I, .L_N81 + +.L_N4: /* if(n&4)*/ + andi I, N, 0x04 + beq I, ZERO, .L_N2 move S1, TS add.d S2, TS, TL - srai.d I, M, 0x02 + move I, M add.d S3, S2, TL - add.d S4, S2, T0 - add.d TS, S3, T0 - beq I, ZERO, .L_I3 - -.L_4I1: /* I-- */ - xvld U0, S1, 0x00 - xvld U1, S2, 0x00 - xvld U2, S3, 0x00 - xvld U3, S4, 0x00 - - xvpackev.d D0, U1, U0 - xvpackod.d D1, U1, U0 - xvpackev.d D2, U3, U2 - xvpackod.d D3, U3, U2 - - xvand.v U0, D0, D0 - xvpermi.q D0, D2, 0x02 // 0 - xvand.v U1, D1, D1 - xvpermi.q D1, D3, 0x02 // 1 - xvpermi.q D2, U0, 0x31 // 2 - xvpermi.q D3, U1, 0x31 // 3 - - xvst D0, TD, 0x00 - xvst D1, TD, 0x20 - xvst D2, TD, 0x40 - xvst D3, TD, 0x60 - - addi.d S1, S1, 0x20 - addi.d S2, S2, 0x20 - addi.d S3, S3, 0x20 - addi.d S4, S4, 0x20 - addi.d TD, TD, 0x80 - + add.d S4, S3, TL + add.d TS, S4, TL + beq I, ZERO, .L_N2 + +.L_N41: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + fst.d F2, TD, 0x10 + fst.d F3, TD, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d TD, TD, 0x20 addi.d I, I, -1 - blt ZERO, I, .L_4I1 - -.L_I3: - andi I, M, 0x03 - beq I, ZERO, .L_N2 - -.L_4II1: - fld.d F0, S1, 0x00 - fld.d F1, S2, 0x00 - fld.d F2, S3, 0x00 - fld.d F3, S4, 0x00 - - fst.d F0, TD, 0x00 - addi.d S1, S1, 0x08 - fst.d F1, TD, 0x08 - addi.d S2, S2, 0x08 - fst.d F2, TD, 0x10 - addi.d S3, S3, 0x08 - fst.d F3, TD, 0x18 - addi.d S4, S4, 0x08 - - addi.d TD, TD, 0x20 - addi.d I, I, -1 - blt ZERO, I, .L_4II1 - -.L_N2: - andi J, N, 0x02 - beq ZERO, J, .L_N1 + blt ZERO, I, .L_N41 + +.L_N2: /* if(n&2)*/ + andi I, N, 0x02 + beq I, ZERO, .L_N1 move S1, TS add.d S2, TS, TL - srai.d I, M, 0x01 + move I, M add.d TS, S2, TL - beq I, ZERO, .L_NI1 - 
-.L_2I1: /* I-- */ - xvld U0, S1, 0x00 - xvld U1, S2, 0x00 - - xvpackev.d D0, U1, U0 - xvpackod.d D1, U1, U0 - - xvpermi.q D0, D1, 0x02 // 0 + beq I, ZERO, .L_N1 - xvst D0, TD, 0x00 +.L_N21: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 - addi.d S1, S1, 0x10 - addi.d S2, S2, 0x10 - addi.d TD, TD, 0x20 + fst.d F0, TD, 0x00 + fst.d F1, TD, 0x08 + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 addi.d I, I, -1 - blt ZERO, I, .L_2I1 - -.L_NI1: - andi I, M, 0x01 - beq I, ZERO, .L_N1 - + blt ZERO, I, .L_N21 - fld.d F0, S1, 0x00 - fld.d F1, S2, 0x00 +.L_N1: /* if(n&1)*/ + andi I, N, 0x01 + beq I, ZERO, .L_N0 - fst.d F0, TD, 0x00 - addi.d S1, S1, 0x08 - fst.d F1, TD, 0x08 - addi.d S2, S2, 0x08 - addi.d TD, TD, 0x10 + move S1, TS + move I, M + beq I, ZERO, .L_N0 -.L_N1: - move S1, TS - beq ZERO, M, .L_N0 +.L_N11: /* if(i>0)*/ + fld.d F0, S1, 0x00 + fst.d F0, TD, 0x00 -.L_M1: - fld.d F0, S1, 0x00 - addi.d S1, S1, 0x08 - fst.d F0, TD, 0x00 - addi.d TD, TD, 0x08 - addi.d M, M, -1 - blt ZERO, M, .L_M1 + addi.d S1, S1, 0x08 + addi.d TD, TD, 0x08 + addi.d I, I, -1 + blt ZERO, I, .L_N11 .L_N0: - LDARG $r23, $sp, 0x00 - LDARG $r24, $sp, 0x08 - LDARG $r25, $sp, 0x10 - LDARG $r26, $sp, 0x18 - LDARG $r27, $sp, 0x20 - LDARG $r28, $sp, 0x28 - LDARG $r29, $sp, 0x30 - LDARG $r30, $sp, 0x38 - LDARG $r31, $sp, 0x40 - LD $f23, $sp, 0x48 - LD $f24, $sp, 0x50 - LD $f25, $sp, 0x58 - LD $f26, $sp, 0x60 - LD $f27, $sp, 0x68 - LD $f28, $sp, 0x70 - LD $f29, $sp, 0x78 - LD $f30, $sp, 0x80 - LD $f31, $sp, 0x88 - addi.d $sp, $sp, 0x90 - jirl $r0, $r1, 0x00 + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + LDARG $r30, $sp, 56 + addi.d $sp, $sp, 64 + jirl $r0, $r1, 0x00 EPILOGUE \ No newline at end of file From a978ad318070e24f5ca0cf4221b55abb5869287e Mon Sep 17 00:00:00 2001 From: pengxu Date: Tue, 13 May 2025 16:09:12 +0800 Subject: [PATCH 08/37] Loongarch64: add C 
functions 
of zgemm_ncopy_16 --- kernel/generic/zgemm_ncopy_16.c | 332 ++++++++++++++++++++++++++++++++ 1 file changed, 332 insertions(+) create mode 100644 kernel/generic/zgemm_ncopy_16.c diff --git a/kernel/generic/zgemm_ncopy_16.c b/kernel/generic/zgemm_ncopy_16.c new file mode 100644 index 000000000..088103525 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_16.c @@ -0,0 +1,332 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + lda *= 2; + + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset9 = aoffset8 + lda; + aoffset10 = aoffset9 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset += 16 * lda; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + ctemp11 = 
*(aoffset6 
+ 0); + ctemp12 = *(aoffset6 + 1); + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + + ctemp17 = *(aoffset9 + 0); + ctemp18 = *(aoffset9 + 1); + ctemp19 = *(aoffset10 + 0); + ctemp20 = *(aoffset10 + 1); + ctemp21 = *(aoffset11 + 0); + ctemp22 = *(aoffset11 + 1); + ctemp23 = *(aoffset12 + 0); + ctemp24 = *(aoffset12 + 1); + ctemp25 = *(aoffset13 + 0); + ctemp26 = *(aoffset13 + 1); + ctemp27 = *(aoffset14 + 0); + ctemp28 = *(aoffset14 + 1); + ctemp29 = *(aoffset15 + 0); + ctemp30 = *(aoffset15 + 1); + ctemp31 = *(aoffset16 + 0); + ctemp32 = *(aoffset16 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + *(boffset + 16) = ctemp17; + *(boffset + 17) = ctemp18; + *(boffset + 18) = ctemp19; + *(boffset + 19) = ctemp20; + *(boffset + 20) = ctemp21; + *(boffset + 21) = ctemp22; + *(boffset + 22) = ctemp23; + *(boffset + 23) = ctemp24; + *(boffset + 24) = ctemp25; + *(boffset + 25) = ctemp26; + *(boffset + 26) = ctemp27; + *(boffset + 27) = ctemp28; + *(boffset + 28) = ctemp29; + *(boffset + 29) = ctemp30; + *(boffset + 30) = ctemp31; + *(boffset + 31) = ctemp32; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + aoffset5 += 2; + aoffset6 += 2; + aoffset7 += 2; + aoffset8 += 2; + aoffset9 += 2; + aoffset10 += 2; + aoffset11 += 2; + aoffset12 += 2; + aoffset13 += 2; + aoffset14 += 2; + aoffset15 += 2; + aoffset16 += 2; + + boffset += 32; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; 
+ aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + aoffset5 += 2; + aoffset6 += 2; + aoffset7 += 2; + aoffset8 += 2; + + boffset += 16; + i --; + }while(i > 0); + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 2; + 
aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + + boffset += 8; + i --; + }while(i > 0); + } + } /* end of if (n & 4) */ + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + i --; + }while(i > 0); + } + + } /* end of if (n & 2) */ + + if (n & 1){ + aoffset1 = aoffset; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2; + boffset += 2; + i --; + }while(i > 0); + } + + } /* end of if (n & 1) */ + + return 0; +} From 9a7e3f102b393f263559ef2852d92e7138ff5482 Mon Sep 17 00:00:00 2001 From: guoyuanplct Date: Wed, 14 May 2025 00:09:26 +0800 Subject: [PATCH 09/37] kernel/riscv64:Fixed the bug of openblas_utest_ext failing in c/zgemv and some c/zgbmv tests: --- kernel/riscv64/zgemv_n_vector.c | 83 +++++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 20 deletions(-) diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index cbed06c97..8d44dd25a 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -66,7 +66,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG lda2 = lda * 2; vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl); vy1_new = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl); - for (k = 0, j = 0; k < m / gvl; k++) + for (k = 0, j = 0; k < m / gvl; k ++) { a_ptr = a; ix = 0; @@ -121,30 +121,73 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, #endif a_ptr += lda2; ix += inc_x2; + } - for (; i < n; i += 4) + for (i = n % 4 ; i < n; i += 4) { #if !defined(XCONJ) - - x_v0 = 
VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4); - x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4); - temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4); - temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4); - temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 4); - temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 4); - VSEV_FLOAT(&temp_rr[0], temp_rv, 4); - VSEV_FLOAT(&temp_ii[0], temp_iv, 4); + // temp_rr[0] = alpha_r * x[ix] - alpha_i * x[ix + 1]; + // temp_rr[1] = alpha_r * x[ix + inc_x2] - alpha_i * x[ix + inc_x2 + 1]; + x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 2); + x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 2); + temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2); + temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 2); + + // temp_ii[0] = alpha_r * x[ix + 1] + alpha_i * x[ix]; + // temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] + alpha_i * x[ix + inc_x2]; + temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 2); + temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 2); + VSEV_FLOAT(&temp_rr[0], temp_rv, 2); + VSEV_FLOAT(&temp_ii[0], temp_iv, 2); + + // temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] - alpha_i * x[ix + inc_x2 * 2 + 1]; + // temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] - alpha_i * x[ix + inc_x2 * 3 + 1]; + x_v0 = VLSEV_FLOAT(&x[ix + inc_x2 * 2], inc_x2 * sizeof(FLOAT), 2); + x_v1 = VLSEV_FLOAT(&x[ix + inc_x2 * 2 + 1], inc_x2 * sizeof(FLOAT), 2); + temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2); + temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 2); + + // temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] + alpha_i * x[ix + inc_x2 * 2]; + // temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] + alpha_i * x[ix + inc_x2 * 3]; + temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 2); + temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 2); + VSEV_FLOAT(&temp_rr[2], temp_rv, 2); + VSEV_FLOAT(&temp_ii[2], temp_iv, 2); #else - x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4); - x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4); - temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4); - temp_iv = 
VFMUL_VF_FLOAT(x_v0, alpha_i, 4); - temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 4); - temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_r, x_v1, 4); - VSEV_FLOAT(&temp_rr[0], temp_rv, 4); - VSEV_FLOAT(&temp_ii[0], temp_iv, 4); + // temp_rr[0] = alpha_r * x[ix] + alpha_i * x[ix + 1]; + // temp_rr[1] = alpha_r * x[ix + inc_x2] + alpha_i * x[ix + inc_x2 + 1]; + x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 2); + x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 2); + temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2); + temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 2); + + + // temp_ii[0] = alpha_r * x[ix + 1] - alpha_i * x[ix]; + // temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] - alpha_i * x[ix + inc_x2]; + temp_iv = VFMUL_VF_FLOAT(x_v1, alpha_r, 2); + temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_i, x_v0, 2); + VSEV_FLOAT(&temp_rr[0], temp_rv, 2); + VSEV_FLOAT(&temp_ii[0], temp_iv, 2); + + + // temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] + alpha_i * x[ix + inc_x2 * 2 + 1]; + // temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] + alpha_i * x[ix + inc_x2 * 3 + 1]; + x_v0 = VLSEV_FLOAT(&x[ix + inc_x2 * 2], inc_x2 * sizeof(FLOAT), 2); + x_v1 = VLSEV_FLOAT(&x[ix + inc_x2 * 2 + 1], inc_x2 * sizeof(FLOAT), 2); + temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2); + temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 2); + + + temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] - alpha_i * x[ix + inc_x2 * 2]; + temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] - alpha_i * x[ix + inc_x2 * 3]; + temp_iv = VFMUL_VF_FLOAT(x_v1, alpha_r, 2); + temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_i, x_v0, 2); + VSEV_FLOAT(&temp_rr[2], temp_rv, 2); + VSEV_FLOAT(&temp_ii[2], temp_iv, 2); + + #endif @@ -257,7 +300,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl); j += gvl * 2; - iy += inc_yv; + iy += inc_yv ; } // tail if (j / 2 < m) From 4d213653d857d6365221b79c16d4e151120e9fbe Mon Sep 17 00:00:00 2001 From: 
guoyuanplct Date: Thu, 15 May 2025 13:29:14 +0800 Subject: [PATCH 10/37] kernel/riscv64:Added support for omatcopy on riscv64. --- kernel/riscv64/KERNEL.RISCV64_ZVL256B | 6 ++ kernel/riscv64/omatcopy_cn_vector.c | 125 ++++++++++++++++++++++++++ kernel/riscv64/zomatcopy_cn_vector.c | 111 +++++++++++++++++++++++ 3 files changed, 242 insertions(+) create mode 100644 kernel/riscv64/omatcopy_cn_vector.c create mode 100644 kernel/riscv64/zomatcopy_cn_vector.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index 9915fd949..ba7a52bbf 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -201,3 +201,9 @@ endif ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif + +ZOMATCOPY_CN = zomatcopy_cn_vector.c +COMATCOPY_CN = zomatcopy_cn_vector.c + +DOMATCOPY_CN = omatcopy_cn_vector.c +SOMATCOPY_CN = omatcopy_cn_vector.c diff --git a/kernel/riscv64/omatcopy_cn_vector.c b/kernel/riscv64/omatcopy_cn_vector.c new file mode 100644 index 000000000..444c8232d --- /dev/null +++ b/kernel/riscv64/omatcopy_cn_vector.c @@ -0,0 +1,125 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + + +#if !defined(DOUBLE) +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m4)() +#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4) +#else +#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m4)() +#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) +#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4) +#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4) +#endif + + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i,j; + FLOAT *aptr,*bptr; + size_t vl; + + FLOAT_V_T va, vb,va1,vb1; + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + + aptr = a; + bptr = b; + + if ( alpha == 0.0 ) + { + vl = VSETVL_MAX; + va = VFMVVF_FLOAT(0, vl); + for ( i=0; i Date: Thu, 15 May 2025 18:55:47 +0800 Subject: [PATCH 11/37] Format Code --- 
kernel/riscv64/omatcopy_cn_vector.c | 2 -- kernel/riscv64/zomatcopy_cn_vector.c | 5 ----- 2 files changed, 7 deletions(-) diff --git a/kernel/riscv64/omatcopy_cn_vector.c b/kernel/riscv64/omatcopy_cn_vector.c index 444c8232d..d079310b8 100644 --- a/kernel/riscv64/omatcopy_cn_vector.c +++ b/kernel/riscv64/omatcopy_cn_vector.c @@ -26,8 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" - - #if !defined(DOUBLE) #define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m4)() #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) diff --git a/kernel/riscv64/zomatcopy_cn_vector.c b/kernel/riscv64/zomatcopy_cn_vector.c index bbfbd214a..b141ed4a6 100644 --- a/kernel/riscv64/zomatcopy_cn_vector.c +++ b/kernel/riscv64/zomatcopy_cn_vector.c @@ -70,7 +70,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, FLOAT_VX2_T va, vb; unsigned int gvl = 0; - if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); @@ -85,8 +84,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, for(j=0; j Date: Fri, 16 May 2025 18:24:46 +0800 Subject: [PATCH 12/37] Add retry mechanism after deadlock timeout for c910v. --- .github/workflows/c910v.yml | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml index c5b497316..9981c437b 100644 --- a/.github/workflows/c910v.yml +++ b/.github/workflows/c910v.yml @@ -83,9 +83,39 @@ jobs: - name: test run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH - qemu-riscv64 ./utest/openblas_utest - qemu-riscv64 ./utest/openblas_utest_ext + run_with_retry() { + local cmd="$1" + local time_out=10 + local retries=10 + local attempt=0 + + for ((i=1; i<=retries; i++)); do + attempt=$((i)) + if timeout -s 12 --preserve-status $time_out $cmd; then + echo "Command succeeded on attempt $i." 
+ return 0 + else + local exit_code=$? + if [ $exit_code -eq 140 ]; then + echo "Attempt $i timed out (retrying...)" + time_out=$((time_out + 5)) + else + echo "Attempt $i failed with exit code $exit_code. Aborting workflow." + exit $exit_code + fi + fi + done + echo "All $retries attempts failed, giving up." + echo "Final failure was due to timeout." + echo "Aborting workflow." + exit $exit_code + } + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + which qemu-riscv64 + export QEMU_BIN=$(which qemu-riscv64) + run_with_retry "$QEMU_BIN ./utest/openblas_utest" + run_with_retry "$QEMU_BIN ./utest/openblas_utest_ext" + OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1 OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1 OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1 From 6680e0592f9f2e4e0551b13d7c1f6fc3e225fe95 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 17 May 2025 05:12:15 -0700 Subject: [PATCH 13/37] Fix conditional inclusion of SGEMM_KERNEL_DIRECT --- interface/gemm.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index d36925629..54e5604fd 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -417,21 +417,24 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; -#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) -#if defined(DYNAMIC_ARCH) && defined(ARCH_x86) - if (support_avx512() ) +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) +#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) +#if defined(DYNAMIC_ARCH) + if (support_avx512() ) +#endif if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); return; } #endif -#if defined(DYNAMIC_ARCH) && defined(ARCH_ARM64) - if (support_sme1()){ 
+#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) +#if defined(DYNAMIC_ARCH) + if (support_sme1()) +#endif if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) { SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); return; } - } #endif #endif From f2022c23aca676dcfc43a539bde02dc14411966b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 May 2025 16:08:12 +0200 Subject: [PATCH 14/37] Remove sve capability from NeoverseN1 and specify CortexX2/A?10 as arm8.4a --- cmake/cc.cmake | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index f292f1c57..66b316f7f 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -229,9 +229,9 @@ if (${CORE} STREQUAL NEOVERSEN1) if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) set (CCOMMON_OPT "${CCOMMON_OPT} -tp=neoverse-n1") elseif (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a -mtune=neoverse-n1") else () - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a") endif() endif () endif () @@ -260,13 +260,13 @@ endif () if (${CORE} STREQUAL CORTEXA510) if (NOT DYNAMIC_ARCH) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") endif () endif () if (${CORE} STREQUAL CORTEXA710) if (NOT DYNAMIC_ARCH) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") endif () endif () @@ -278,7 +278,7 @@ endif () if (${CORE} STREQUAL CORTEXX2) if (NOT DYNAMIC_ARCH) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") endif () endif () From 8779eac3b8afb1b862b85ef08ceec3305d054e09 Mon Sep 17 00:00:00 2001 From: 
Martin Kroeker Date: Mon, 19 May 2025 08:55:14 -0700 Subject: [PATCH 15/37] Do not add a 64 suffix to the library name if the user-provided suffix already contains it --- CMakeLists.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f94c4c474..7094eb5b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -123,7 +123,12 @@ message(WARNING "CMake support is experimental. It does not yet support all buil include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") -set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) +string(FIND "${LIBNAMESUFFIX}" "${SUFFIX64_UNDERSCORE}" HAVE64) +if (${HAVE64} GREATER -1) + set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}) +else () + set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}) +endif () set(BLASDIRS interface driver/level2 driver/level3 driver/others) @@ -716,4 +721,5 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake DESTINATION ${CMAKECONFIG_INSTALL_DIR}) install(EXPORT "${PN}${SUFFIX64}Targets" NAMESPACE "${PN}${SUFFIX64}::" - DESTINATION ${CMAKECONFIG_INSTALL_DIR}) \ No newline at end of file + DESTINATION ${CMAKECONFIG_INSTALL_DIR}) + From 4ca76d9de4ce8808498aa314e7dad961eef16d5b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 19 May 2025 12:07:24 -0700 Subject: [PATCH 16/37] Expressly provide a shared libs option --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7094eb5b7..f13f707f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,6 +62,7 @@ option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm th option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) option(BUILD_STATIC_LIBS "Build static library" OFF) +option(BUILD_SHARED_LIBS "Build shared 
library" OFF) if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) endif() From 2351a98005c68aca88e9403ff19f83fe90c6bd49 Mon Sep 17 00:00:00 2001 From: Masato Nakagawa Date: Wed, 21 May 2025 21:21:52 +0900 Subject: [PATCH 17/37] Update 2D thread-partitioned GEMM for M << N case. --- driver/level3/level3_thread.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 77aaeee6b..05d349d97 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -851,9 +851,19 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF /* Objective function come from sum of partitions in m and n. */ /* (n / nthreads_n) + (m / nthreads_m) */ /* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m) */ - while (nthreads_m % 2 == 0 && n * nthreads_m + m * nthreads_n > n * (nthreads_m / 2) + m * (nthreads_n * 2)) { - nthreads_m /= 2; - nthreads_n *= 2; + BLASLONG cost = 0, div = 0; + for (BLASLONG i = 1; i <= sqrt(nthreads_m); i++) { + if (nthreads_m % i) continue; + BLASLONG j = nthreads_m / i; + BLASLONG cost_i = n * j + m * nthreads_n * i; + BLASLONG cost_j = n * i + m * nthreads_n * j; + if (cost == 0 || + cost_i < cost) {cost = cost_i; div = i;} + if (cost_j < cost) {cost = cost_j; div = j;} + } + if (div > 1) { + nthreads_m /= div; + nthreads_n *= div; } } From bd573a9d387abb3bd81a88660f2b064302ab3a93 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 21 May 2025 22:01:02 +0200 Subject: [PATCH 18/37] Expand mingw32 gfortran workaround to all versions after 14.1 --- ctest/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 03b157843..83a715005 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -6,7 +6,7 @@ enable_language(Fortran) endif() set(CMAKE_C_FLAGS 
"${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") -if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_EQUAL 14.2) +if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_GREATER 14.1) list(REMOVE_ITEM ${CMAKE_Fortran_FLAGS} -O3 -O2 -O1 -Os) set (CMAKE_Fortran_FLAGS_RELEASE "" CACHE STRING "" FORCE) endif() From 42b7d1f8972f6444395ce71125da2eed1a0ae196 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 21 May 2025 22:03:38 +0200 Subject: [PATCH 19/37] Fix addressing of alpha in CBLAS --- interface/zsyr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/zsyr.c b/interface/zsyr.c index 8bc9ac177..51cca84ee 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -116,12 +116,12 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, #else -void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, void* valpha, FLOAT *x, int incx, FLOAT *a, int lda) { FLOAT *buffer; int uplo; blasint info; - FLOAT * ALPHA = α + FLOAT * ALPHA = (FLOAT*)valpha; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; #ifdef SMP From 20f2ba014143f195ff5cafa9b3bb98d5c89a3a03 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 21 May 2025 23:44:17 +0200 Subject: [PATCH 20/37] Move declaration of i for pre-C99 compilers --- driver/level3/level3_thread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 05d349d97..db3bffc10 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -852,7 +852,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF /* (n / nthreads_n) + (m / nthreads_m) */ /* = (n * nthreads_m + m * nthreads_n) / (nthreads_n * nthreads_m) */ BLASLONG cost = 0, div = 0; - for (BLASLONG i = 1; i <= 
sqrt(nthreads_m); i++) { + BLASLONG i; + for (i = 1; i <= sqrt(nthreads_m); i++) { if (nthreads_m % i) continue; BLASLONG j = nthreads_m / i; BLASLONG cost_i = n * j + m * nthreads_n * i; From 669c847ceb87faf5242d88cdf13d687ae6573038 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 23 May 2025 05:52:48 -0700 Subject: [PATCH 21/37] support extra flag for NaN handling --- kernel/x86_64/cscal.c | 74 ++++++++++++++++++++++++++++++------------- kernel/x86_64/zscal.c | 68 ++++++++++++++++++++++++++++----------- 2 files changed, 101 insertions(+), 41 deletions(-) diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 212a21594..be32bf35a 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -229,10 +229,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ( da_i == 0.0 ) { - + if (!dummy2) { while(j < n1) { - x[i]=0.0; x[i+1]=0.0; x[i+inc_x]=0.0; @@ -244,21 +243,48 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while(j < n) { - x[i]=0.0; x[i+1]=0.0; i += inc_x ; j++; - } + } else { + float temp; + while(j < n1) + { + if (isnan(x[i])|| isnan(x[i+1])) + temp=NAN; + else + temp=0.0; + x[i]=temp; + x[i+1]=temp; + if (isnan(x[i+inc_x])|| isnan(x[i+inc_x+1])) + temp=NAN; + else + temp=0.0; + x[i+inc_x]= temp; + x[i+inc_x+1]= temp; + i += 2*inc_x; + j+=2; + } + while(j < n) + { + if (isnan(x[i])|| isnan(x[i+1])) + temp=NAN; + else + temp=0.0; + x[i]=temp; + x[i+1]=temp; + i += inc_x; + j++; + } + } } else { - while(j < n1) { - if (isnan(x[i]) || isinf(x[i])) temp0 = NAN; else @@ -278,7 +304,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, x[i+inc_x] = temp1; i += 2*inc_x ; j+=2; - } while(j < n) @@ -305,14 +330,12 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, else { - - if ( da_i == 0.0 ) + if ( da_i == 0.0 && dummy2 ) { BLASLONG n1 = n & -2; while(j < n1) { - temp0 = da_r * x[i]; x[i+1] = da_r * x[i+1]; 
x[i] = temp0; @@ -367,22 +390,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, return(0); } - BLASLONG n1 = n & -16; if ( n1 > 0 ) { alpha[0] = da_r; alpha[1] = da_i; - if ( da_r == 0.0 ) - if ( da_i == 0 ) + if ( da_i == 0 && !dummy2) cscal_kernel_16_zero(n1 , alpha , x); else - cscal_kernel_16_zero_r(n1 , alpha , x); + cscal_kernel_16/*_zero_r*/(n1 , alpha , x); else cscal_kernel_16(n1 , alpha , x); - i = n1 << 1; j = n1; } @@ -393,6 +413,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { FLOAT res=0.0; if (isnan(da_r)) res= da_r; + if (dummy2) + if (isnan(x[i])||isnan(x[i+1])) res= NAN; while(j < n) { x[i]=res; @@ -415,7 +437,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { - while(j < n) { temp0 = -da_i * x[i+1]; @@ -424,11 +445,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if (!isinf(x[i+1])) x[i+1] = da_i * x[i]; else x[i+1] = NAN; - if ( x[i] == x[i]) //preserve NaN + if ( !isnan(x[i])) //preserve NaN x[i] = temp0; i += 2 ; j++; - } } @@ -439,12 +459,22 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ( da_i == 0.0 ) { - while(j < n) { - + temp0 = da_r * x[i]; - x[i+1] = da_r * x[i+1]; + if (dummy2) { + if (isnan(x[i])||isinf(x[i])) temp0=NAN; + if (isnan(x[i+1])||isinf(x[i+1])) + x[i+1]=NAN; + else + x[i+1] = da_r * x[i+1]; + } else { + if (isnan(x[i])) + x[i+1] = NAN; + else + x[i+1] = da_r * x[i+1]; + } x[i] = temp0; i += 2 ; j++; @@ -476,7 +506,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, temp0 = da_r * x[i] - da_i * x[i+1]; x[i+1] = da_r * x[i+1] + da_i * x[i]; - x[i] = temp0; + if(!isnan(x[i]))x[i] = temp0; i += 2 ; j++; diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 7859ef6e3..b3d146fd0 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -222,13 +222,14 @@ int CNAME(BLASLONG n, BLASLONG 
dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ( da_r == 0.0 ) { + BLASLONG n1 = n & -2; if ( da_i == 0.0 ) { + if (!dummy2) { while(j < n1) { - x[i]=0.0; x[i+1]=0.0; x[i+inc_x]=0.0; @@ -245,9 +246,40 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, x[i+1]=0.0; i += inc_x ; j++; + } + } else { + float temp; + while(j < n1) + { + if (isnan(x[i])|| isnan(x[i+1])) + temp=NAN; + else + temp=0.0; + x[i]=temp; + x[i+1]=temp; + if (isnan(x[i+inc_x])|| isnan(x[i+inc_x+1])) + temp=NAN; + else + temp=0.0; + x[i+inc_x]= temp; + x[i+inc_x+1]= temp; + i += 2*inc_x; + j+=2; } + while(j < n) + { + if (isnan(x[i])|| isnan(x[i+1])) + temp=NAN; + else + temp=0.0; + x[i]=temp; + x[i+1]=temp; + i += inc_x; + j++; + } + } } else { @@ -260,7 +292,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, temp0 = -da_i * x[i+1]; if (!isinf(x[i+1])) x[i+1] = da_i * x[i]; - else x[i+1] = NAN; + else x[i+1] = NAN; x[i] = temp0; if (isnan(x[i+inc_x]) || isinf(x[i+inc_x])) temp1 = NAN; @@ -291,16 +323,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } - - } } else { - - if ( da_i == 0.0 ) + if ( da_i == 0.0 && dummy2) { BLASLONG n1 = n & -2; @@ -370,26 +399,27 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, alpha[1] = da_i; if ( da_r == 0.0 ) - if ( da_i == 0 ) + if ( da_i == 0 && !dummy2 ) zscal_kernel_8_zero(n1 , alpha , x); else -// zscal_kernel_8_zero_r(n1 , alpha , x); zscal_kernel_8(n1 , alpha , x); else - if ( da_i == 0 && da_r == da_r) + /* if ( da_i == 0 && da_r == da_r ) zscal_kernel_8_zero_i(n1 , alpha , x); - else + else*/ zscal_kernel_8(n1 , alpha , x); - } + i = n1 << 1; j = n1; - - if ( da_r == 0.0 || da_r != da_r ) + } + if ( da_r == 0.0 || isnan(da_r) ) { if ( da_i == 0.0 ) { - FLOAT res=0.0; - if (da_r != da_r) res= da_r; + FLOAT res=0.0; + if (isnan(da_r)) res= da_r; + if (dummy2) + if (isnan(x[i])||isnan(x[i+1])) res= NAN; while(j < n) { 
x[i]=res; @@ -412,7 +442,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { - while(j < n) { temp0 = -da_i * x[i+1]; @@ -421,7 +450,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if (!isinf(x[i+1])) x[i+1] = da_i * x[i]; else x[i+1] = NAN; - if ( x[i] == x[i]) //preserve NaN + if ( !isnan(x[i])) //preserve NaN x[i] = temp0; i += 2 ; j++; @@ -437,8 +466,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { while(j < n) { - temp0 = da_r * x[i]; + if (isnan(x[i]))x[i+1]=NAN; + else x[i+1] = da_r * x[i+1]; x[i] = temp0; i += 2 ; @@ -453,7 +483,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { temp0 = da_r * x[i] - da_i * x[i+1]; x[i+1] = da_r * x[i+1] + da_i * x[i]; - x[i] = temp0; + if(!isnan(x[i]))x[i] = temp0; i += 2 ; j++; From 28f8fdaf0f87c9bca2a79dd41536bd7ff2027e0c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 23 May 2025 14:59:59 +0200 Subject: [PATCH 22/37] support flag for NaN/Inf handling and fix scaling of NaN/Inf values --- kernel/arm64/zscal.S | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S index 4bd43320d..93e51b70c 100644 --- a/kernel/arm64/zscal.S +++ b/kernel/arm64/zscal.S @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define INC_X x4 /* X stride */ #define I x5 /* loop variable */ #define X_COPY x6 /* Copy of X */ - +#define FLAG x7 /* NaN handling level */ /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -217,11 +217,15 @@ zscal_begin: cmp N, xzr ble .Lzscal_kernel_L999 + ldr FLAG, [sp] + cmp FLAG, #1 + beq .Lzscal_kernel_R_non_zero + fcmp DA_R, #0.0 bne .Lzscal_kernel_R_non_zero - fcmp DA_I, #0.0 - beq .Lzscal_kernel_RI_zero +// fcmp DA_I, #0.0 +// beq .Lzscal_kernel_RI_zero // b .Lzscal_kernel_R_zero From cf06250d36b21f6d0962fc2c84fdd426b93085bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 24 May 2025 06:06:24 -0700 Subject: [PATCH 23/37] add handling of dummy2 flag --- kernel/power/zscal.S | 6 ++++++ kernel/power/zscal.c | 2 +- kernel/power/zscal_ppc440.S | 5 +++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/power/zscal.S b/kernel/power/zscal.S index ae68ee672..5b2861651 100644 --- a/kernel/power/zscal.S +++ b/kernel/power/zscal.S @@ -51,6 +51,7 @@ #define X r8 #define INCX r9 #endif +#define FLAG r11 #endif #if defined(_AIX) || defined(__APPLE__) @@ -61,6 +62,7 @@ #define X r8 #define INCX r9 #endif +#define FLAG r11 #endif #define FZERO f0 @@ -94,6 +96,10 @@ fcmpu cr0, FZERO, ALPHA_I bne- cr0, LL(A1I1) + LDLONG FLAG, 104(SP) + cmpwi cr0, FLAG, 1 + beq- cr0, LL(A1I1) + cmpwi cr0, INCX, 2 * SIZE bne- cr0, LL(A0IN) diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 6b7392d0c..671dc9612 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -136,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F if ( inc_x <= 0 ) return(0); - if (da_r == ZERO && da_i == ZERO) { + if (da_r == ZERO && da_i == ZERO && dummy2 == 0) { //clear the vector and return if (inc_x == 1) { memset(x, 0, n*COMPSIZE*SIZE); diff --git a/kernel/power/zscal_ppc440.S 
b/kernel/power/zscal_ppc440.S index 55dd1b87b..c75bb4ae2 100644 --- a/kernel/power/zscal_ppc440.S +++ b/kernel/power/zscal_ppc440.S @@ -64,6 +64,7 @@ #endif #define INC1 r11 +#define FLAG r12 #define FZERO f0 #define ALPHA_R f1 @@ -97,6 +98,10 @@ fcmpu cr0, FZERO, ALPHA_I bne- cr0, LL(A1I1) + lwz FLAG, FRAMESLOT(0)(SP) + cmpwi cr0, FLAG, 1 + beq- cr0, LL(A1I1) + LL(A0IN): srawi. r0, N, 3 mtspr CTR, r0 From fb8dc8ff5c0382d048017e9e6174197b044b17bc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 25 May 2025 14:47:06 -0700 Subject: [PATCH 24/37] Add dummy2 flag handling --- kernel/zarch/cscal.c | 70 +++++++++++++++++++++++++++++++++++++------- kernel/zarch/zscal.c | 63 ++++++++++++++++++++++++++++++++------- 2 files changed, 111 insertions(+), 22 deletions(-) diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c index e623f306b..1c9f2cda7 100644 --- a/kernel/zarch/cscal.c +++ b/kernel/zarch/cscal.c @@ -210,7 +210,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -2; if (da_i == 0.0) { - + if (dummy2 == 0) { while (j < n1) { x[i] = 0.0; @@ -230,11 +230,43 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, j++; } + } else { + while (j < n1) { + if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) { + x[i] = NAN; + x[i+1] = NAN; + }else{ + x[i] = 0.0; + x[i + 1] = 0.0; + } + if (isnan(x[i+inc_x]) || isinf(x[i+inc_x]) || isnan(x[i+1+inc_x])) { + x[i + inc_x] = NAN; + x[i + 1 + inc_x] = NAN; + } else { + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + } + i += 2 * inc_x; + j += 2; + + } + while (j < n) { + if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) { + x[i] = NAN; + x[i+1] = NAN; + }else{ + x[i] = 0.0; + x[i + 1] = 0.0; + } + i += inc_x; + j++; + } + } } else { while (j < n1) { - if (isnan(x[i]) || isinf(x[i])) + if (isnan(x[i]) || isinf(x[i])) temp0 = NAN; else temp0 = -da_i * x[i + 1]; @@ -276,7 +308,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, 
FLOAT da_i, } else { - if (da_i == 0.0) { + if (da_i == 0.0 && dummy2) { BLASLONG n1 = n & -2; while (j < n1) { @@ -335,12 +367,16 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, alpha[1] = da_i; if (da_r == 0.0) - if (da_i == 0) + if (da_i == 0 && dummy2 == 0) cscal_kernel_16_zero(n1, x); - else + else { +/* if (dummy2 == 0) cscal_kernel_16_zero_r(n1, alpha, x); - else if (da_i == 0) - cscal_kernel_16_zero_i(n1, alpha, x); + else*/ + cscal_kernel_16(n1, da_r, da_i, x); + } +/* else if (da_i == 0 && !isnan(da_r)) + cscal_kernel_16/*_zero_i(n1, alpha, x);*/ else cscal_kernel_16(n1, da_r, da_i, x); @@ -354,7 +390,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, float res = 0.0; if (isnan(da_r)) res = da_r; while (j < n) { - + if (dummy2) + if (isnan(x[i])|| isnan(x[i+1])) res=NAN; x[i] = res; x[i + 1] = res; i += 2; @@ -382,7 +419,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, x[i + 1] = da_i * x[i]; else x[i + 1] = NAN; - if (x[i] == x[i]) + if (!isnan(x[i])) x[i] = temp0; i += 2; j++; @@ -398,7 +435,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n) { temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; + if (dummy2) { + if (isnan(x[i])||isinf(x[i]))temp0 = NAN; + if (isnan(x[i+1])||isinf(x[i+1])) + x[i+1] = NAN; + else + x[i+1] = da_r * x[i + 1]; + } else { + if (isnan(x[i])) + x[i + 1] = NAN; + else + x[i + 1] = da_r * x[i + 1]; + } x[i] = temp0; i += 2; j++; @@ -411,7 +459,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, temp0 = da_r * x[i] - da_i * x[i + 1]; x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; + if (!isnan(x[i])) x[i] = temp0; i += 2; j++; diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 36466a6e0..5111bc455 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -208,7 +208,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG 
dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -2; if (da_i == 0.0) { - + if (dummy2 == 0) { while (j < n1) { x[i] = 0.0; @@ -228,7 +228,38 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, j++; } - + } else { + while (j < n1) { + if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) { + x[i] = NAN; + x[i+1] = NAN; + } else { + x[i] = 0.0; + x[i+1] = 0.0; + } + if (isnan(x[i+inc_x]) || isinf(x[i+inc_x]) || isnan(x[i+inc_x+1])) { + x[i + inc_x] = NAN; + x[i + inc_x + 1] = NAN; + } else { + x[i + inc_x] = 0.; + x[i + inc_x + 1] = 0.; + } + i += 2 * inc_x; + j += 2; + } + + while (j < n) { + if (isnan(x[i]) || isinf(x[i]) || isnan(x[i+1])) { + x[i] = NAN; + x[i+1] = NAN; + } else { + x[i] = 0.; + x[i+1] = 0.; + } + i += inc_x; + j++; + } + } } else { while (j < n1) { @@ -276,7 +307,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { - if (da_i == 0.0) { + if (da_i == 0.0 && dummy2) { BLASLONG n1 = n & -2; while (j < n1) { @@ -335,12 +366,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, alpha[1] = da_i; if (da_r == 0.0) - if (da_i == 0) + if (da_i == 0 && dummy2 == 0) zscal_kernel_8_zero(n1, x); else zscal_kernel_8(n1, da_r, da_i, x); - else if (da_i == 0 && da_r == da_r) - zscal_kernel_8_zero_i(n1, alpha, x); else zscal_kernel_8(n1, da_r, da_i, x); @@ -354,7 +383,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, double res= 0.0; if (isnan(da_r)) res = da_r; while (j < n) { - + if (dummy2) + if (isnan(x[i]) || isnan(x[i+1])) res = NAN; x[i] = res; x[i + 1] = res; i += 2; @@ -381,7 +411,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, x[i + 1] = da_i * x[i]; else x[i + 1] = NAN; - if (x[i]==x[i]) + if (!isnan(x[i])) x[i] = temp0; i += 2; j++; @@ -397,8 +427,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, while (j < n) { temp0 = da_r * x[i]; - x[i + 1] = 
da_r * x[i + 1]; - x[i] = temp0; + if (dummy2) { + if (isnan(x[i]) || isinf(x[i])) temp0 = NAN; + if (isnan(x[i + 1]) || isinf(x[i + 1])) + x[i + 1] = NAN; + else + x[i + 1] = da_r * x[i + 1]; + } else { + if (isnan(x[i])) + x[i + 1] = NAN; + else + x[i + 1] = da_r * x[i + 1]; + } + x[i] = temp0; i += 2; j++; @@ -410,7 +451,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, temp0 = da_r * x[i] - da_i * x[i + 1]; x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; + if (!isnan(x[i])) x[i] = temp0; i += 2; j++; From 45fd2d9b0790c5ca3698502d65d59d38d911ef4f Mon Sep 17 00:00:00 2001 From: guoyuanplct Date: Thu, 29 May 2025 17:50:44 +0800 Subject: [PATCH 25/37] Optimized the axpby function. --- kernel/riscv64/KERNEL.RISCV64_ZVL256B | 8 ++ kernel/riscv64/axpby_vector_v2.c | 149 ++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 kernel/riscv64/axpby_vector_v2.c diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B index ba7a52bbf..0fd6adb8b 100644 --- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B +++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B @@ -169,11 +169,15 @@ SSYMV_U_KERNEL = symv_U_vector.c SSYMV_L_KERNEL = symv_L_vector.c DSYMV_U_KERNEL = symv_U_vector.c DSYMV_L_KERNEL = symv_L_vector.c + + CSYMV_U_KERNEL = ../generic/zsymv_k.c CSYMV_L_KERNEL = ../generic/zsymv_k.c ZSYMV_U_KERNEL = ../generic/zsymv_k.c ZSYMV_L_KERNEL = ../generic/zsymv_k.c + + CHEMV_L_KERNEL = zhemv_LM_vector.c CHEMV_M_KERNEL = zhemv_LM_vector.c CHEMV_U_KERNEL = zhemv_UV_vector.c @@ -207,3 +211,7 @@ COMATCOPY_CN = zomatcopy_cn_vector.c DOMATCOPY_CN = omatcopy_cn_vector.c SOMATCOPY_CN = omatcopy_cn_vector.c + +SAXPBYKERNEL = axpby_vector_v2.c +DAXPBYKERNEL = axpby_vector_v2.c + diff --git a/kernel/riscv64/axpby_vector_v2.c b/kernel/riscv64/axpby_vector_v2.c new file mode 100644 index 000000000..369346e1b --- /dev/null +++ b/kernel/riscv64/axpby_vector_v2.c @@ -0,0 +1,149 @@ 
+/*************************************************************************** +Copyright (c) 2022, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
+#define FLOAT_V_T vfloat32m8_t
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
+#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m8)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m8)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m8)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m8)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
+#else
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define FLOAT_V_T vfloat64m4_t
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#endif
+/* axpby kernel: y := alpha * x + beta * y for n elements, processed in chunks of gvl = VSETVL(n) lanes. */
+int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
+{
+    FLOAT_V_T vx, vy;
+    unsigned int gvl;
+    if (n <= 0) return (0);
+    /* Test inc_y == 0 before any inc_x case: every update targets y[0], so it has to run as a scalar recurrence. */
+    if (inc_y == 0)
+    {
+        FLOAT vf = y[0];
+        for (; n > 0; n--)
+        {
+            vf = (vf * beta) + (x[0] * alpha);
+            x += inc_x;
+        }
+        y[0] = vf;
+    }
+    else if (inc_x == 1 && inc_y == 1)
+    {
+        while (n > 0)
+        {
+            gvl = VSETVL(n);
+            /* contiguous x and y: unit-stride loads and store */
+            vx = VLEV_FLOAT(x, gvl);
+            vy = VLEV_FLOAT(y, gvl);
+
+            vy = VFMULVF_FLOAT(vy, beta, gvl);
+            vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl);
+
+            VSEV_FLOAT(y, vy, gvl);
+
+            x += gvl;
+            y += gvl;
+            n -= gvl;
+        }
+    }
+    else if (1 == inc_x)
+    {
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+        while (n > 0)
+        {
+            gvl = VSETVL(n);
+            vy = VLSEV_FLOAT(y, stride_y, gvl);
+            vx = VLEV_FLOAT(x, gvl);
+            /* stride_y is a byte stride (inc_y elements of FLOAT) */
+            vy = VFMULVF_FLOAT(vy, beta, gvl);
+            vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl);
+
+            VSSEV_FLOAT(y, stride_y, vy, gvl);
+
+            x += gvl;
+            y += gvl * inc_y;
+            n -= gvl;
+        }
+    }
+    else if (1 == inc_y)
+    {
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        while (n > 0)
+        {
+            gvl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, gvl);
+            vy = VLEV_FLOAT(y, gvl);
+
+            vy = VFMULVF_FLOAT(vy, beta, gvl);
+            vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl);
+
+            VSEV_FLOAT(y, vy, gvl);
+
+            x += gvl * inc_x;
+            y += gvl;
+            n -= gvl;
+        }
+    }
+    else
+    {
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+        while (n > 0)
+        {
+            gvl = VSETVL(n);
+            vy = VLSEV_FLOAT(y, stride_y, gvl);
+            vx = VLSEV_FLOAT(x, stride_x, gvl);
+
+            vy = VFMULVF_FLOAT(vy, beta, gvl);
+            vy = VFMACCVF_FLOAT(vy, alpha, vx, gvl);
+
+            VSSEV_FLOAT(y, stride_y, vy, gvl);
+
+            x += gvl * inc_x;
+            y += gvl * inc_y;
+            n -= gvl;
+        }
+    }
+
+    return (0);
+}
From d2003dc8869366c1054e41f181a4d41a152035db Mon Sep 17 00:00:00 2001
From: guoyuanplct
Date: Thu, 29 May 2025 18:38:22 +0800
Subject: [PATCH 26/37] del lines

---
 kernel/riscv64/KERNEL.RISCV64_ZVL256B | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B
index 0fd6adb8b..2b4f0a545 100644
--- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B
+++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B
@@ -170,14 +170,11 @@ SSYMV_L_KERNEL = symv_L_vector.c
 DSYMV_U_KERNEL = symv_U_vector.c
 DSYMV_L_KERNEL = symv_L_vector.c
 
-
 CSYMV_U_KERNEL = ../generic/zsymv_k.c
 CSYMV_L_KERNEL = ../generic/zsymv_k.c
 ZSYMV_U_KERNEL = ../generic/zsymv_k.c
 ZSYMV_L_KERNEL = ../generic/zsymv_k.c
 
-
-
 CHEMV_L_KERNEL = zhemv_LM_vector.c
 CHEMV_M_KERNEL = zhemv_LM_vector.c
 CHEMV_U_KERNEL = zhemv_UV_vector.c
@@ -214,4 +211,3 @@ SOMATCOPY_CN = omatcopy_cn_vector.c
 
 SAXPBYKERNEL = axpby_vector_v2.c
 DAXPBYKERNEL = axpby_vector_v2.c
-
From 2ae019161a85333a35018b517d4b34474a7694e9 Mon Sep 17 00:00:00 2001
From: guoyuanplct
Date: Thu, 5 Jun 2025 21:53:03 +0800
Subject: [PATCH 27/37] fixed the performance problem in RISCV64_ZVL256 when OPENBLAS_K is small

---
 kernel/riscv64/zaxpy_vector.c | 47 ++++++++++++++++++++++++++++++
 kernel/riscv64/zdot_vector.c | 54
++++++++++++++++++++++++++++++++++-
 2 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c
index 1e766c5f4..dd5906931 100644
--- a/kernel/riscv64/zaxpy_vector.c
+++ b/kernel/riscv64/zaxpy_vector.c
@@ -43,8 +43,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
 #endif
 
+/* Scalar fallback for short vectors: y += alpha * x, or alpha * conj(x) when CONJ is defined. */
+/* Must be "static": a plain C99 "inline" definition provides no external symbol, so a call that is not inlined would fail to link. */
+#if !defined(DOUBLE)
+static inline int small_caxpy_kernel(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+#else
+static inline int small_zaxpy_kernel(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+#endif
+{
+	BLASLONG i=0;
+	BLASLONG ix,iy;
+	BLASLONG inc_x2;
+	BLASLONG inc_y2;
+
+	if ( n <= 0 ) return(0);
+	if ( da_r == 0.0 && da_i == 0.0 ) return(0);
+
+	ix = 0;
+	iy = 0;
+
+	inc_x2 = 2 * inc_x;
+	inc_y2 = 2 * inc_y;
+
+	while(i < n)
+	{
+#if !defined(CONJ)
+		y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ;
+		y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ;
+#else
+		y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ;
+		y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ;
+#endif
+		ix += inc_x2 ;
+		iy += inc_y2 ;
+		i++ ;
+	}
+	return(0);
+}
+
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
+#if !defined(DOUBLE)
+	if(n < 16) {
+		return small_caxpy_kernel(n, dummy0, dummy1, da_r, da_i, x, inc_x, y, inc_y, dummy, dummy2);
+	}
+#else
+	if(n < 8) {
+		return small_zaxpy_kernel(n, dummy0, dummy1, da_r, da_i, x, inc_x, y, inc_y, dummy, dummy2);
+	}
+#endif
 	BLASLONG i = 0, j = 0;
 	BLASLONG ix = 0,iy = 0;
 	if(n <= 0) return(0);
diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c
index 13b8fe378..398de28e5 100644
--- a/kernel/riscv64/zdot_vector.c
+++ b/kernel/riscv64/zdot_vector.c
@@ -68,8 +68,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4)
 #endif
 
+/* Scalar fallback for short vectors: returns sum(x[i] * y[i]), or sum(conj(x[i]) * y[i]) when CONJ is defined. */
+/* Must be "static": a plain C99 "inline" definition provides no external symbol, so a call that is not inlined would fail to link. */
+#if !defined(DOUBLE)
+static inline OPENBLAS_COMPLEX_FLOAT small_cdot_kernel(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+static inline OPENBLAS_COMPLEX_FLOAT small_zdot_kernel(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+	BLASLONG i=0;
+	BLASLONG ix=0,iy=0;
+	FLOAT dot[2];
+	OPENBLAS_COMPLEX_FLOAT result;
+	BLASLONG inc_x2;
+	BLASLONG inc_y2;
+
+	dot[0]=0.0;
+	dot[1]=0.0;
+
+	CREAL(result) = 0.0 ;
+	CIMAG(result) = 0.0 ;
+
+	if ( n < 1 ) return(result);
+
+	inc_x2 = 2 * inc_x ;
+	inc_y2 = 2 * inc_y ;
+
+	while(i < n)
+	{
+#if !defined(CONJ)
+		dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ;
+		dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ;
+#else
+		dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ;
+		dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ;
+#endif
+		ix += inc_x2 ;
+		iy += inc_y2 ;
+		i++ ;
+	}
+	CREAL(result) = dot[0];
+	CIMAG(result) = dot[1];
+	return(result);
+}
 OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
+#if !defined(DOUBLE)
+	if(n < 16) {
+		return small_cdot_kernel(n, x, inc_x, y, inc_y);
+	}
+#else
+	if(n < 8) {
+		return small_zdot_kernel(n, x, inc_x, y, inc_y);
+	}
+#endif
 	BLASLONG i=0, j=0;
 	BLASLONG ix=0,iy=0;
 	FLOAT dot[2];
@@ -148,4 +200,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
 	CREAL(result) = dot[0];
 	CIMAG(result) = dot[1];
 	return(result);
-}
+}
\ No newline at end of file
From 5442aff218e47fdf882dd2828b3552618b4bc761 Mon Sep 17 00:00:00 2001
From: Arne Juul
Date: Sun, 8 Jun 2025 19:50:15 +0000
Subject: [PATCH 28/37] Accumulate results in output register explicitly

---
 kernel/arm64/dot_kernel_asimd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/arm64/dot_kernel_asimd.c
b/kernel/arm64/dot_kernel_asimd.c index a404c9636..f52112830 100644 --- a/kernel/arm64/dot_kernel_asimd.c +++ b/kernel/arm64/dot_kernel_asimd.c @@ -134,7 +134,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " fadd v4.4s, v4.4s, v6.4s \n" \ " fadd v0.4s, v0.4s, v4.4s \n" \ " faddp v0.4s, v0.4s, v0.4s \n" \ - " faddp v0.4s, v0.4s, v0.4s \n" + " faddp "OUT", v0.2s \n" #else /* !defined(DSDOT) */ #define KERNEL_F1 \ From f18b7a46bf4597a2b8eb07365cf41a888dfc924a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 11 Jun 2025 01:47:43 -0700 Subject: [PATCH 29/37] add dummy2 flag handling for inf/nan agnostic zeroing --- kernel/riscv64/zscal_rvv.c | 14 +++++++++++++- kernel/riscv64/zscal_vector.c | 13 ++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c index ae79d9f9d..9f990e0c0 100644 --- a/kernel/riscv64/zscal_rvv.c +++ b/kernel/riscv64/zscal_rvv.c @@ -70,6 +70,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F FLOAT_VX2_T vx2; if(inc_x == 1) { + if (dummy2 == 0 && da_r==0. && da_i == 0.) { + BLASLONG i; + for (i=0; i < n*2; i++) x[i]=0.; + return(0); + } else { for (size_t vl; n > 0; n -= vl, x += vl*2) { vl = VSETVL(n); @@ -80,6 +85,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F vt = VFMULVF_FLOAT(vr, da_r, vl); vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl); + vi = VFMULVF_FLOAT(vi, da_r, vl); vi = VFMACCVF_FLOAT(vi, da_i, vr, vl); @@ -87,9 +93,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F vx2 = VSET_VX2(vx2, 1, vi); VSSEG_FLOAT(x, vx2, vl); } + } } else { - + if (dummy2 == 0 && da_r==0. && da_i == 0.) 
{ + BLASLONG i,ix=0,inc_x2=2*inc_x; + for (i=0; i < n; i++) {x[ix]=0.;x[ix+1]=0.;ix+=inc_x2;}; + return(0); + } else { for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) { vl = VSETVL(n); @@ -105,6 +116,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F vx2 = VSET_VX2(vx2, 0, vt); vx2 = VSET_VX2(vx2, 1, vi); VSSSEG_FLOAT(x, stride_x, vx2, vl); + } } } diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c index 536bbdf73..a72361b04 100644 --- a/kernel/riscv64/zscal_vector.c +++ b/kernel/riscv64/zscal_vector.c @@ -57,9 +57,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F if((n <= 0) || (inc_x <= 0)) return(0); - unsigned int gvl = 0; - FLOAT_V_T vt, v0, v1; - { + if (dummy2 == 0 && da_r == 0. && da_i == 0.) { + int i,inc_x2,ix; + inc_x2 = 2*inc_x; + ix=0; + for (i=0;i Date: Wed, 11 Jun 2025 22:10:46 +0200 Subject: [PATCH 30/37] Use generic SCAL kernels to address inf/nan handling for now --- kernel/sparc/KERNEL | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/sparc/KERNEL b/kernel/sparc/KERNEL index d6580609b..b2a8184a8 100644 --- a/kernel/sparc/KERNEL +++ b/kernel/sparc/KERNEL @@ -86,3 +86,8 @@ endif ifndef QROTMKERNEL QROTMKERNEL = ../generic/rotm.c endif + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c From e12132abd4b43d4e2560bd492204b6cba26f8563 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 11 Jun 2025 22:12:10 +0200 Subject: [PATCH 31/37] Use generic C/ZSCAL kernels to address inf/nan handling for now --- kernel/x86/KERNEL | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/x86/KERNEL b/kernel/x86/KERNEL index 1095c1528..3ae268e6c 100644 --- a/kernel/x86/KERNEL +++ b/kernel/x86/KERNEL @@ -200,3 +200,6 @@ endif ifndef QROTMKERNEL QROTMKERNEL = ../generic/rotm.c endif + +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c From 
58eeb9041cfe93c56fb09337040b3994bddd8fc0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 12 Jun 2025 03:03:01 -0700 Subject: [PATCH 32/37] fix handling of dummy2 --- kernel/arm64/zscal.S | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S index 93e51b70c..97d8a8b7a 100644 --- a/kernel/arm64/zscal.S +++ b/kernel/arm64/zscal.S @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define INC_X x4 /* X stride */ #define I x5 /* loop variable */ #define X_COPY x6 /* Copy of X */ -#define FLAG x7 /* NaN handling level */ +#define FLAG x7 /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -216,23 +216,22 @@ zscal_begin: cmp N, xzr ble .Lzscal_kernel_L999 - - ldr FLAG, [sp] - cmp FLAG, #1 - beq .Lzscal_kernel_R_non_zero +ldr FLAG, [sp] +cmp FLAG, #1 +beq .Lzscal_kernel_RI_non_zero fcmp DA_R, #0.0 bne .Lzscal_kernel_R_non_zero -// fcmp DA_I, #0.0 -// beq .Lzscal_kernel_RI_zero + fcmp DA_I, #0.0 + beq .Lzscal_kernel_RI_zero // b .Lzscal_kernel_R_zero .Lzscal_kernel_R_non_zero: fcmp DA_I, #0.0 - beq .Lzscal_kernel_I_zero +//QUAK beq .Lzscal_kernel_I_zero /******************************************************************************* * A_R != 0 && A_I != 0 From 549a9f1dbb152945b14f9376d055ad7c12042917 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 12 Jun 2025 18:54:33 +0200 Subject: [PATCH 33/37] Disable the default SSE kernels for CSCAL/ZSCAL for now --- kernel/x86_64/KERNEL | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index c270ff077..a8ba70a31 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -323,11 +323,11 @@ DSCALKERNEL = scal_sse2.S endif ifndef CSCALKERNEL -CSCALKERNEL = zscal_sse.S +CSCALKERNEL = ../arm/zscal.c endif ifndef 
ZSCALKERNEL -ZSCALKERNEL = zscal_sse2.S +ZSCALKERNEL = ../arm/zscal.c endif ifndef ASCALKERNEL From 73af02b89fa807402ff37d459c593e121542ee6f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 12 Jun 2025 13:33:56 -0700 Subject: [PATCH 34/37] use dummy2 as Inf/NAN handling flag --- kernel/riscv64/zscal.c | 89 +++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 49 deletions(-) diff --git a/kernel/riscv64/zscal.c b/kernel/riscv64/zscal.c index 8499145f4..b210f9af3 100644 --- a/kernel/riscv64/zscal.c +++ b/kernel/riscv64/zscal.c @@ -27,65 +27,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************************** * 2013/09/14 Saar -* BLASTEST float : OK -* BLASTEST double : OK -* CTEST : OK -* TEST : OK +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK * **************************************************************************************/ #include "common.h" +// The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces. +// In certain cases, the expected return values for cblas_s/zscal differ from those of other upper-level interfaces. +// To handle this, we use the dummy2 parameter to differentiate between them. 
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i=0; - BLASLONG inc_x2; - BLASLONG ip = 0; - FLOAT temp; + BLASLONG i = 0; + BLASLONG inc_x2; + BLASLONG ip = 0; + FLOAT temp; - if ( (n <= 0) || (inc_x <= 0)) - return(0); + if ((n <= 0) || (inc_x <= 0)) + return(0); + inc_x2 = 2 * inc_x; + if (dummy2 == 0) { + for (i = 0; i < n; i++) + { + if (da_r == 0.0 && da_i == 0.0) + { + x[ip] = 0.0; + x[ip+1] = 0.0; + } + else + { + temp = da_r * x[ip] - da_i * x[ip+1]; + x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; + x[ip] = temp; + } - inc_x2 = 2 * inc_x; - for ( i=0; i Date: Fri, 13 Jun 2025 00:54:27 -0700 Subject: [PATCH 35/37] resync with the generic arm version for inf/nan handling --- kernel/mips/zscal.c | 97 ++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 50 deletions(-) diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c index ae1c87fce..b210f9af3 100644 --- a/kernel/mips/zscal.c +++ b/kernel/mips/zscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project +Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,61 +25,58 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + #include "common.h" +// The c/zscal_k function is called not only by cblas_c/zscal but also by other upper-level interfaces. +// In certain cases, the expected return values for cblas_s/zscal differ from those of other upper-level interfaces. +// To handle this, we use the dummy2 parameter to differentiate between them. int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i=0; - BLASLONG inc_x2; - BLASLONG ip = 0; - FLOAT temp; + BLASLONG i = 0; + BLASLONG inc_x2; + BLASLONG ip = 0; + FLOAT temp; - inc_x2 = 2 * inc_x; - for ( i=0; i Date: Fri, 13 Jun 2025 13:32:02 +0200 Subject: [PATCH 36/37] temporarily change default C/ZSCAL to the non-asm implementation --- kernel/mips64/KERNEL | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 2ebd8a5bd..0ebb459b3 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -6,6 +6,9 @@ CROTKERNEL = ../mips/zrot.c ZROTKERNEL = ../mips/zrot.c CSWAPKERNEL = ../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c + +CSCALKERNEL = zscal.c +ZSCALKERNEL = zscal.c ifndef SNRM2KERNEL From e338d34ce1d3c3cfed50e0060fee392ca7ef3166 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 13 Jun 2025 13:37:15 +0200 Subject: [PATCH 37/37] fix path --- kernel/mips64/KERNEL | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL index 0ebb459b3..d720aaff5 100644 --- a/kernel/mips64/KERNEL +++ b/kernel/mips64/KERNEL @@ -7,8 +7,8 @@ ZROTKERNEL = ../mips/zrot.c CSWAPKERNEL = 
../mips/zswap.c ZSWAPKERNEL = ../mips/zswap.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c ifndef SNRM2KERNEL