From 803e8d48389b520c490bb859e7bae39d1c0e4524 Mon Sep 17 00:00:00 2001 From: yuanjia Date: Tue, 12 Aug 2025 18:03:16 +0800 Subject: [PATCH] Move the value assignment of vector x in gemv_n_sve.c to the outermost loop to reduce the repeated data retrieval. 1. Verify correctness using BLAS-Tester 2. Using the built-in benchmark to verify performance, the performance of the float and double types improved by about 60% and about 40%, respectively. The test command is: export OMP_NUM_THREADS=1;numactl -C 10 -l ./sgemv.goto 3000 4000 100 export OMP_NUM_THREADS=1;numactl -C 10 -l ./dgemv.goto 3000 4000 100 --- kernel/arm64/gemv_n_sve.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/kernel/arm64/gemv_n_sve.c b/kernel/arm64/gemv_n_sve.c index 59a5c8557..c2f455739 100644 --- a/kernel/arm64/gemv_n_sve.c +++ b/kernel/arm64/gemv_n_sve.c @@ -69,13 +69,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *a2_ptr = a + lda * width * 2; for (j = 0; j < width; j++) { - for (i = 0; (i + sve_size - 1) < m; i += sve_size) { - ix = j * inc_x; - - SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); - SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); - SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); + ix = j * inc_x; + SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); + SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); + SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]); + for (i = 0; (i + sve_size - 1) < m; i += sve_size) { SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i); SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i); SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i); @@ -89,10 +88,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } if (i < m) { - SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]); - SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]); - SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * 
width * 2)]); - SV_TYPE a00_vec = svld1(pg, a0_ptr + i); SV_TYPE a01_vec = svld1(pg, a1_ptr + i); SV_TYPE a02_vec = svld1(pg, a2_ptr + i); @@ -115,9 +110,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr = a2_ptr; for (j = width * 3; j < n; j++) { ix = j * inc_x; + SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); for (i = 0; (i + sve_size - 1) < m; i += sve_size) { SV_TYPE y_vec = svld1(pg_true, y + i); - SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); SV_TYPE a_vec = svld1(pg_true, a_ptr + i); y_vec = svmla_x(pg_true, y_vec, a_vec, x_vec); svst1(pg_true, y + i, y_vec); @@ -125,7 +120,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if (i < m) { SV_TYPE y_vec = svld1(pg, y + i); - SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]); SV_TYPE a_vec = svld1(pg, a_ptr + i); y_vec = svmla_m(pg, y_vec, a_vec, x_vec); svst1(pg, y + i, y_vec);