From 211dfd0754a3800d178b14b24e312330954721df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Apr 2025 22:21:57 +0200 Subject: [PATCH 01/38] disable the CooperLake microkernel as it produces wrong results --- kernel/x86_64/sbgemv_n.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/sbgemv_n.c b/kernel/x86_64/sbgemv_n.c index 08ccace61..b2d4eb74f 100644 --- a/kernel/x86_64/sbgemv_n.c +++ b/kernel/x86_64/sbgemv_n.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) -#include "sbgemv_n_microk_cooperlake.c" -#endif +//#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +//#include "sbgemv_n_microk_cooperlake.c" +//#endif #define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \ ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \ From f1e628b88974ba984adf0ea08c817aef60548720 Mon Sep 17 00:00:00 2001 From: "Iha, Taisei" Date: Fri, 11 Apr 2025 20:00:33 +0900 Subject: [PATCH 02/38] Further performance improvements to [SD]GEMV. --- kernel/arm64/KERNEL.A64FX | 4 +- kernel/arm64/KERNEL.NEOVERSEV1 | 2 + kernel/arm64/gemv_n_sve_v1x3.c | 138 ++++++++++++++++++++++ kernel/arm64/gemv_n_sve_v4x3.c | 207 +++++++++++++++++++++++++++++++++ 4 files changed, 349 insertions(+), 2 deletions(-) create mode 100644 kernel/arm64/gemv_n_sve_v1x3.c create mode 100644 kernel/arm64/gemv_n_sve_v4x3.c diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 75f0f39a7..3d68271da 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -1,6 +1,6 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE -SGEMVNKERNEL = gemv_n_sve.c -DGEMVNKERNEL = gemv_n_sve.c +SGEMVNKERNEL = gemv_n_sve_v4x3.c +DGEMVNKERNEL = gemv_n_sve_v4x3.c SGEMVTKERNEL = gemv_t_sve_v4x3.c DGEMVTKERNEL = gemv_t_sve_v4x3.c diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index bacedf8cf..3e622bcbf 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -1,5 +1,7 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE +SGEMVNKERNEL = gemv_n_sve_v1x3.c +DGEMVNKERNEL = gemv_n_sve_v1x3.c SGEMVTKERNEL = gemv_t_sve_v1x3.c DGEMVTKERNEL = gemv_t_sve_v1x3.c ifeq ($(BUILD_BFLOAT16), 1) diff --git a/kernel/arm64/gemv_n_sve_v1x3.c b/kernel/arm64/gemv_n_sve_v1x3.c new file mode 100644 index 000000000..44c9a89b9 --- /dev/null +++ b/kernel/arm64/gemv_n_sve_v1x3.c @@ -0,0 +1,138 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include + +#include "common.h" + +#ifdef DOUBLE +#define SV_COUNT svcntd +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64_s64 +#define SV_DUP svdup_f64 +#else +#define SV_COUNT svcntw +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32_s64 +#define SV_DUP svdup_f32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + if (inc_y == 1) { + BLASLONG width = (n + 3 - 1) / 3; + + FLOAT *a0_ptr = a_ptr + lda * width * 0; + FLOAT *a1_ptr = a_ptr + lda * width * 1; + FLOAT *a2_ptr = a_ptr + lda * width * 2; + + FLOAT *x0_ptr = x + inc_x * width * 0; + FLOAT *x1_ptr = x + inc_x * width * 1; + FLOAT *x2_ptr = x + inc_x * width * 2; + + for (j = 0; j < width; j++) { + svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); + + SV_TYPE temp0_vec = SV_DUP(alpha * x0_ptr[ix]); + SV_TYPE temp1_vec = SV_DUP(alpha * x1_ptr[ix]); + SV_TYPE temp2_vec = SV_DUP(alpha * x2_ptr[ix]); + i = 0; + BLASLONG sve_size = SV_COUNT(); + while ((i + sve_size * 1 - 1) < m) { + SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + + y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); + y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); + y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); + + svst1_vnum(SV_TRUE(), y + i, 0, y0_vec); + i += sve_size * 1; + } + + if (i < m) { + svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); + + pg00 = svand_z(SV_TRUE(), pg0, pg00); + pg01 = svand_z(SV_TRUE(), pg0, pg01); + pg02 = svand_z(SV_TRUE(), pg0, pg02); + + SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + + y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); + y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); + y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); + + svst1_vnum(pg0, y + i, 0, y0_vec); + } + a0_ptr += lda; + a1_ptr += lda; + a2_ptr += lda; + ix += inc_x; + } + return(0); + } + + for (j = 0; j < n; j++) { + temp = alpha * x[ix]; + iy = 0; + for (i = 0; i < m; i++) { + y[iy] += temp * a_ptr[i]; + iy += inc_y; + } + a_ptr += lda; + ix += inc_x; + } + return (0); +} diff --git a/kernel/arm64/gemv_n_sve_v4x3.c b/kernel/arm64/gemv_n_sve_v4x3.c new file mode 100644 index 000000000..92e4f75b6 --- /dev/null +++ b/kernel/arm64/gemv_n_sve_v4x3.c @@ -0,0 +1,207 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include + +#include "common.h" + +#ifdef DOUBLE +#define SV_COUNT svcntd +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64_s64 +#define SV_DUP svdup_f64 +#else +#define SV_COUNT svcntw +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32_s64 +#define SV_DUP svdup_f32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + if (inc_y == 1) { + BLASLONG width = (n + 3 - 1) / 3; + + FLOAT *a0_ptr = a_ptr + lda * width * 0; + FLOAT *a1_ptr = a_ptr + lda * width * 1; + FLOAT *a2_ptr = a_ptr + lda * width * 2; + + FLOAT *x0_ptr = x + inc_x * width * 0; + FLOAT *x1_ptr = x + inc_x * width * 1; + FLOAT *x2_ptr = x + inc_x * width * 2; + + for (j = 0; j < width; j++) { + svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg10 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg20 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg30 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg11 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg21 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg31 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg12 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg22 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg32 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); + + SV_TYPE temp0_vec = SV_DUP(alpha * x0_ptr[ix]); + SV_TYPE temp1_vec = SV_DUP(alpha * x1_ptr[ix]); + SV_TYPE temp2_vec = SV_DUP(alpha * x2_ptr[ix]); + i = 0; + BLASLONG sve_size = SV_COUNT(); + while ((i + sve_size * 4 - 1) < m) { + SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0); + SV_TYPE y1_vec = svld1_vnum(SV_TRUE(), y + i, 1); + SV_TYPE y2_vec = svld1_vnum(SV_TRUE(), y + i, 2); + SV_TYPE y3_vec = svld1_vnum(SV_TRUE(), y + i, 3); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); + SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); + SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); + SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); + SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); + SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); + SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); + SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); + SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); + + y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); + y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec); + y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec); + y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec); + y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); + y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec); + y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec); + y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec); + y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); + y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec); + y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec); + y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec); + + svst1_vnum(SV_TRUE(), y + i, 0, y0_vec); + svst1_vnum(SV_TRUE(), y + i, 1, y1_vec); + svst1_vnum(SV_TRUE(), y + i, 2, y2_vec); + svst1_vnum(SV_TRUE(), y + i, 3, y3_vec); + i += sve_size * 4; + } + + if (i < m) { + svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); + svbool_t pg1 = SV_WHILE(i + sve_size * 1, m); + svbool_t pg2 = SV_WHILE(i + sve_size * 2, m); + svbool_t pg3 = SV_WHILE(i + sve_size * 3, m); + + pg00 = svand_z(SV_TRUE(), pg0, pg00); + pg10 = svand_z(SV_TRUE(), pg1, pg10); + pg20 = svand_z(SV_TRUE(), pg2, pg20); + pg30 = svand_z(SV_TRUE(), pg3, pg30); + pg01 = svand_z(SV_TRUE(), pg0, pg01); + pg11 = svand_z(SV_TRUE(), pg1, pg11); + pg21 = svand_z(SV_TRUE(), pg2, pg21); + pg31 = svand_z(SV_TRUE(), pg3, pg31); + pg02 = svand_z(SV_TRUE(), pg0, pg02); + pg12 = svand_z(SV_TRUE(), pg1, pg12); + pg22 = svand_z(SV_TRUE(), pg2, pg22); + pg32 = svand_z(SV_TRUE(), pg3, pg32); + + SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0); + SV_TYPE y1_vec = svld1_vnum(pg1, y + i, 1); + SV_TYPE y2_vec = svld1_vnum(pg2, y + i, 2); + SV_TYPE y3_vec = svld1_vnum(pg3, y + i, 3); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); + SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); + SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); + SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); + SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); + SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); + SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); + SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); + SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); + + y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); + y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec); + y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec); + y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec); + y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); + y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec); + y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec); + y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec); + y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); + y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec); + y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec); + y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec); + + svst1_vnum(pg0, y + i, 0, y0_vec); + svst1_vnum(pg1, y + i, 1, y1_vec); + svst1_vnum(pg2, y + i, 2, y2_vec); + svst1_vnum(pg3, y + i, 3, y3_vec); + } + a0_ptr += lda; + a1_ptr += lda; + a2_ptr += lda; + ix += inc_x; + } + return(0); + } + + for (j = 0; j < n; j++) { + temp = alpha * x[ix]; + iy = 0; + for (i = 0; i < m; i++) { + y[iy] += temp * a_ptr[i]; + iy += inc_y; + } + a_ptr += lda; + ix += inc_x; + } + return (0); +} From d711906e3e438995146c5c6dce23a2795bbf4740 Mon Sep 17 00:00:00 2001 From: "Usui, Tetsuzo" Date: Fri, 11 Apr 2025 20:39:52 +0900 Subject: [PATCH 03/38] Add symv kernels for arm64 --- kernel/arm64/KERNEL.ARMV8SVE | 5 ++ kernel/arm64/KERNEL.NEOVERSEN1 | 4 + kernel/arm64/symv_L_asimd_4x4.c | 113 +++++++++++++++++++++++++ kernel/arm64/symv_L_sve_v1x4.c | 103 +++++++++++++++++++++++ kernel/arm64/symv_U_asimd_4x4.c | 106 +++++++++++++++++++++++ kernel/arm64/symv_U_sve_v1x4.c | 104 +++++++++++++++++++++++ kernel/arm64/symv_microk_asimd_4x4.c | 120 +++++++++++++++++++++++++++ kernel/arm64/symv_microk_sve_v1x4.c | 89 ++++++++++++++++++++ 8 files changed, 644 insertions(+) create mode 100644 kernel/arm64/symv_L_asimd_4x4.c create mode 100644 kernel/arm64/symv_L_sve_v1x4.c create mode 100644 kernel/arm64/symv_U_asimd_4x4.c create mode 100644 kernel/arm64/symv_U_sve_v1x4.c create mode 100644 kernel/arm64/symv_microk_asimd_4x4.c create mode 100644 kernel/arm64/symv_microk_sve_v1x4.c diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 4ff53c6d0..0e51f2c2f 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -84,6 +84,11 @@ DGEMVTKERNEL = gemv_t_sve_v1x3.c CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S +SSYMV_L_KERNEL = symv_L_sve_v1x4.c +SSYMV_U_KERNEL = symv_U_sve_v1x4.c +DSYMV_L_KERNEL = symv_L_sve_v1x4.c +DSYMV_U_KERNEL = symv_U_sve_v1x4.c + SASUMKERNEL = sasum_thunderx2t99.c DASUMKERNEL = dasum_thunderx2t99.c CASUMKERNEL = casum_thunderx2t99.c diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 index e623814d6..665ebe459 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN1 +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -70,6 +70,10 @@ DGEMVTKERNEL = gemv_t.S CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S +SSYMV_L_KERNEL = symv_L_asimd_4x4.c +SSYMV_U_KERNEL = symv_U_asimd_4x4.c +DSYMV_L_KERNEL = symv_L_asimd_4x4.c +DSYMV_U_KERNEL = symv_U_asimd_4x4.c SASUMKERNEL = sasum_thunderx2t99.c DASUMKERNEL = dasum_thunderx2t99.c diff --git a/kernel/arm64/symv_L_asimd_4x4.c b/kernel/arm64/symv_L_asimd_4x4.c new file mode 100644 index 000000000..b3d15ba67 --- /dev/null +++ b/kernel/arm64/symv_L_asimd_4x4.c @@ -0,0 +1,113 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "symv_microk_asimd_4x4.c" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j; + FLOAT temp1, temp2; + FLOAT tmp1[4]; + FLOAT tmp2[4]; + FLOAT *a0, *a1, *a2, *a3; + FLOAT x0, x1, x2, x3; + FLOAT *X = x; + FLOAT *Y = y; + + if (inc_y != 1) { + Y = buffer; + COPY_K(m, y, inc_y, Y, 1); + } + if (inc_x != 1) { + if (inc_y != 1) { + X = Y + m; + } else { + X = buffer; + } + COPY_K(m, x, inc_x, X, 1); + } + + BLASLONG offset1 = (offset / 4) * 4; + for (j = 0; j < offset1; j+=4) { + a0 = &a[j*lda]; + a1 = a0 + lda; + a2 = a1 + lda; + a3 = a2 + lda; + x0 = X[j]; + x1 = X[j+1]; + x2 = X[j+2]; + x3 = X[j+3]; + tmp2[0] = a0[j ]*x0 + a0[j+1]*x1 + a0[j+2]*x2 + a0[j+3]*x3; + tmp2[1] = a0[j+1]*x0 + a1[j+1]*x1 + a1[j+2]*x2 + a1[j+3]*x3; + tmp2[2] = a0[j+2]*x0 + a1[j+2]*x1 + a2[j+2]*x2 + a2[j+3]*x3; + tmp2[3] = a0[j+3]*x0 + a1[j+3]*x1 + a2[j+3]*x2 + a3[j+3]*x3; + tmp1[0] = alpha * x0; + tmp1[1] = alpha * x1; + tmp1[2] = alpha * x2; + tmp1[3] = alpha * x3; + + BLASLONG m2 = (m/4)*4; + if (m2 > j+4) + symv_kernel_4x4(j+4, m2, a0, a1, a2, a3, X, Y, tmp1, tmp2); + + for (i = m2; i < m; i++) { + Y[i] += tmp1[0] * a0[i]; + tmp2[0] += a0[i] * X[i]; + Y[i] += tmp1[1] * a1[i]; + tmp2[1] += a1[i] * X[i]; + Y[i] += tmp1[2] * a2[i]; + tmp2[2] += a2[i] * X[i]; + Y[i] += tmp1[3] * a3[i]; + tmp2[3] += a3[i] * X[i]; + } + Y[j] += alpha * tmp2[0]; + Y[j+1] += alpha * tmp2[1]; + Y[j+2] += alpha * tmp2[2]; + Y[j+3] += alpha * tmp2[3]; + } + + for (j = offset1; j < offset; j++) { + temp1 = alpha * X[j]; + temp2 = 0.0; + Y[j] += temp1 * a[j*lda+j]; + for (i = j+1; i < m; i++) { + Y[i] += temp1 * a[j*lda+i]; + temp2 += a[j*lda+i] * X[i]; + } + Y[j] += alpha * temp2; + } + + if (inc_y != 1) { + COPY_K(m, Y, 1, y, inc_y); + } + return(0); +} diff --git a/kernel/arm64/symv_L_sve_v1x4.c b/kernel/arm64/symv_L_sve_v1x4.c new file mode 100644 index 000000000..4b9252339 --- /dev/null +++ b/kernel/arm64/symv_L_sve_v1x4.c @@ -0,0 +1,103 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "symv_microk_sve_v1x4.c" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j; + FLOAT temp1, temp2; + FLOAT tmp1[4]; + FLOAT tmp2[4]; + FLOAT *a0, *a1, *a2, *a3; + FLOAT x0, x1, x2, x3; + FLOAT *X = x; + FLOAT *Y = y; + + if (inc_y != 1) { + Y = buffer; + COPY_K(m, y, inc_y, Y, 1); + } + if (inc_x != 1) { + if (inc_y != 1) { + X = Y + m; + } else { + X = buffer; + } + COPY_K(m, x, inc_x, X, 1); + } + + BLASLONG offset1 = (offset / 4) * 4; + + for (j = 0; j < offset1; j+=4) { + a0 = &a[j*lda]; + a1 = a0 + lda; + a2 = a1 + lda; + a3 = a2 + lda; + x0 = X[j]; + x1 = X[j+1]; + x2 = X[j+2]; + x3 = X[j+3]; + tmp2[0] = a0[j ]*x0 + a0[j+1]*x1 + a0[j+2]*x2 + a0[j+3]*x3; + tmp2[1] = a0[j+1]*x0 + a1[j+1]*x1 + a1[j+2]*x2 + a1[j+3]*x3; + tmp2[2] = a0[j+2]*x0 + a1[j+2]*x1 + a2[j+2]*x2 + a2[j+3]*x3; + tmp2[3] = a0[j+3]*x0 + a1[j+3]*x1 + a2[j+3]*x2 + a3[j+3]*x3; + tmp1[0] = alpha * x0; + tmp1[1] = alpha * x1; + tmp1[2] = alpha * x2; + tmp1[3] = alpha * x3; + + symv_kernel_v1x4(j+4, m, a0, a1, a2, a3, X, Y, tmp1, tmp2); + + Y[j] += alpha * tmp2[0]; + Y[j+1] += alpha * tmp2[1]; + Y[j+2] += alpha * tmp2[2]; + Y[j+3] += alpha * tmp2[3]; + } + + for (j = offset1; j < offset; j++) { + temp1 = alpha * X[j]; + temp2 = 0.0; + a0 = &a[j*lda]; + Y[j] += temp1 * a0[j]; + for (i = j+1; i < m; i++) { + Y[i] += temp1 * a0[i]; + temp2 += a0[i] * X[i]; + } + Y[j] += alpha * temp2; + } + + if (inc_y != 1) { + COPY_K(m, Y, 1, y, inc_y); + } + return(0); +} diff --git a/kernel/arm64/symv_U_asimd_4x4.c b/kernel/arm64/symv_U_asimd_4x4.c new file mode 100644 index 000000000..83e954260 --- /dev/null +++ b/kernel/arm64/symv_U_asimd_4x4.c @@ -0,0 +1,106 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "symv_microk_asimd_4x4.c" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i, j, j1, j2, m2; + FLOAT temp1, temp2; + FLOAT tmp1[4]; + FLOAT tmp2[4]; + FLOAT *a0, *a1, *a2, *a3; + FLOAT *X = x; + FLOAT *Y = y; + + BLASLONG m1 = m - offset; + if (inc_y != 1) { + Y = buffer; + COPY_K(m, y, inc_y, Y, 1); + } + if (inc_x != 1) { + if (inc_y != 1) { + X = Y + m; + } else { + X = buffer; + } + COPY_K(m, x, inc_x, X, 1); + } + + m2 = m - (offset % 4); + for (j = m1; j < m2; j += 4) { + tmp1[0] = alpha * X[j]; + tmp1[1] = alpha * X[j+1]; + tmp1[2] = alpha * X[j+2]; + tmp1[3] = alpha * X[j+3]; + tmp2[0] = 0.0; + tmp2[1] = 0.0; + tmp2[2] = 0.0; + tmp2[3] = 0.0; + a0 = &a[j*lda]; + a1 = a0 + lda; + a2 = a1 + lda; + a3 = a2 + lda; + j1 = (j / 4) * 4; + if ( j1 ) + symv_kernel_4x4(0, j1, a0, a1, a2, a3, X, Y, tmp1, tmp2); + + j2 = 0; + for (j1 = j ; j1 < j+4 ; j1++) { + temp1 = tmp1[j2]; + temp2 = tmp2[j2]; + a0 = &a[j1*lda]; + for (i=j ; i + +static void symv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, + FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ +#ifdef DOUBLE + float64x2_t vtmpx0 = vld1q_dup_f64(&temp1[0]); + float64x2_t vtmpx1 = vld1q_dup_f64(&temp1[1]); + float64x2_t vtmpx2 = vld1q_dup_f64(&temp1[2]); + float64x2_t vtmpx3 = vld1q_dup_f64(&temp1[3]); + float64x2_t vtmpy0 = {0.0, 0.0}; + float64x2_t vtmpy1 = {0.0, 0.0}; + float64x2_t vtmpy2 = {0.0, 0.0}; + float64x2_t vtmpy3 = {0.0, 0.0}; + float64x2_t vxl, vxh, vyl, vyh; + float64x2_t vap0l, vap0h, vap1l, vap1h, vap2l, vap2h, vap3l, vap3h; + BLASLONG i; + for (i = from; i < to; i+=4) { + vyl = vld1q_f64(&y[i]); + vyh = vld1q_f64(&y[i+2]); + vxl = vld1q_f64(&x[i]); + vxh = vld1q_f64(&x[i+2]); + vap0l = vld1q_f64(&a0[i]); + vap0h = vld1q_f64(&a0[i+2]); + vap1l = vld1q_f64(&a1[i]); + vap1h = vld1q_f64(&a1[i+2]); + vap2l = vld1q_f64(&a2[i]); + vap2h = vld1q_f64(&a2[i+2]); + vap3l = vld1q_f64(&a3[i]); + vap3h = vld1q_f64(&a3[i+2]); + vyl = vfmaq_f64(vyl, vtmpx0, vap0l); + vyh = vfmaq_f64(vyh, vtmpx0, vap0h); + vyl = vfmaq_f64(vyl, vtmpx1, vap1l); + vyh = vfmaq_f64(vyh, vtmpx1, vap1h); + vyl = vfmaq_f64(vyl, vtmpx2, vap2l); + vyh = vfmaq_f64(vyh, vtmpx2, vap2h); + vyl = vfmaq_f64(vyl, vtmpx3, vap3l); + vyh = vfmaq_f64(vyh, vtmpx3, vap3h); + vtmpy0 = vfmaq_f64(vtmpy0, vxl, vap0l); + vtmpy0 = vfmaq_f64(vtmpy0, vxh, vap0h); + vtmpy1 = vfmaq_f64(vtmpy1, vxl, vap1l); + vtmpy2 = vfmaq_f64(vtmpy2, vxl, vap2l); + vtmpy1 = vfmaq_f64(vtmpy1, vxh, vap1h); + vtmpy2 = vfmaq_f64(vtmpy2, vxh, vap2h); + vtmpy3 = vfmaq_f64(vtmpy3, vxl, vap3l); + vtmpy3 = vfmaq_f64(vtmpy3, vxh, vap3h); + vst1q_f64(&y[i], vyl); + vst1q_f64(&y[i+2], vyh); + } + temp2[0] += vaddvq_f64(vtmpy0); + temp2[1] += vaddvq_f64(vtmpy1); + temp2[2] += vaddvq_f64(vtmpy2); + temp2[3] += vaddvq_f64(vtmpy3); +#else + float32x4_t vtmpx0 = vld1q_dup_f32(&temp1[0]); + float32x4_t vtmpx1 = vld1q_dup_f32(&temp1[1]); + float32x4_t vtmpx2 = vld1q_dup_f32(&temp1[2]); + float32x4_t vtmpx3 = vld1q_dup_f32(&temp1[3]); + float32x4_t vtmpy0 = {0.0, 0.0, 0.0, 0.0}; + float32x4_t vtmpy1 = {0.0, 0.0, 0.0, 0.0}; + float32x4_t vtmpy2 = {0.0, 0.0, 0.0, 0.0}; + float32x4_t vtmpy3 = {0.0, 0.0, 0.0, 0.0}; + float32x4_t vx, vy; + float32x4_t vap0, vap1, vap2, vap3; + BLASLONG i; + for (i = from; i < to; i+=4) { + vy = vld1q_f32(&y[i]); + vx = vld1q_f32(&x[i]); + vap0 = vld1q_f32(&a0[i]); + vap1 = vld1q_f32(&a1[i]); + vap2 = vld1q_f32(&a2[i]); + vap3 = vld1q_f32(&a3[i]); + vy = vfmaq_f32(vy, vtmpx0, vap0); + vy = vfmaq_f32(vy, vtmpx1, vap1); + vy = vfmaq_f32(vy, vtmpx2, vap2); + vy = vfmaq_f32(vy, vtmpx3, vap3); + vtmpy0 = vfmaq_f32(vtmpy0, vx, vap0); + vtmpy1 = vfmaq_f32(vtmpy1, vx, vap1); + vtmpy2 = vfmaq_f32(vtmpy2, vx, vap2); + vtmpy3 = vfmaq_f32(vtmpy3, vx, vap3); + vst1q_f32(&y[i], vy); + } + temp2[0] += vaddvq_f32(vtmpy0); + temp2[1] += vaddvq_f32(vtmpy1); + temp2[2] += vaddvq_f32(vtmpy2); + temp2[3] += vaddvq_f32(vtmpy3); +#endif +} diff --git a/kernel/arm64/symv_microk_sve_v1x4.c b/kernel/arm64/symv_microk_sve_v1x4.c new file mode 100644 index 000000000..f87613f39 --- /dev/null +++ b/kernel/arm64/symv_microk_sve_v1x4.c @@ -0,0 +1,89 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#ifdef DOUBLE +#define SV_COUNT svcntd +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64_s64 +#define SV_DUP svdup_f64 +#else +#define SV_COUNT svcntw +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32_s64 +#define SV_DUP svdup_f32 +#endif + +static void symv_kernel_v1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, + FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) +{ + SV_TYPE vtmpx0 = SV_DUP(temp1[0]); + SV_TYPE vtmpx1 = SV_DUP(temp1[1]); + SV_TYPE vtmpx2 = SV_DUP(temp1[2]); + SV_TYPE vtmpx3 = SV_DUP(temp1[3]); + SV_TYPE vtmpy0 = SV_DUP(0.0); + SV_TYPE vtmpy1 = SV_DUP(0.0); + SV_TYPE vtmpy2 = SV_DUP(0.0); + SV_TYPE vtmpy3 = SV_DUP(0.0); + SV_TYPE vx, vy; + SV_TYPE vap0, vap1, vap2, vap3; + BLASLONG i; + uint64_t sve_size = SV_COUNT(); + svbool_t pg; + + for (i = from; i < to; i += sve_size) { + pg = SV_WHILE(i, to); + vy = svld1(pg, &y[i]); + vx = svld1(pg, &x[i]); + vap0 = svld1(pg, &a0[i]); + vap1 = svld1(pg, &a1[i]); + vap2 = svld1(pg, &a2[i]); + vap3 = svld1(pg, &a3[i]); + vy = svmla_m(pg, vy, vtmpx0, vap0); + vy = svmla_m(pg, vy, vtmpx1, vap1); + vy = svmla_m(pg, vy, vtmpx2, vap2); + vy = svmla_m(pg, vy, vtmpx3, vap3); + vtmpy0 = svmla_m(pg, vtmpy0, vx, vap0); + vtmpy1 = svmla_m(pg, vtmpy1, vx, vap1); + vtmpy2 = svmla_m(pg, vtmpy2, vx, vap2); + vtmpy3 = svmla_m(pg, vtmpy3, vx, vap3); + svst1(pg, &y[i], vy); + } + pg = SV_TRUE(); + temp2[0] += svaddv(pg, vtmpy0); + temp2[1] += svaddv(pg, vtmpy1); + temp2[2] += svaddv(pg, vtmpy2); + temp2[3] += svaddv(pg, vtmpy3); +} From 7b66330deada3236e2bcd0ccf3e919ab33a88e28 Mon Sep 17 00:00:00 2001 From: zanpeeters Date: Tue, 15 Apr 2025 17:12:03 -0700 Subject: [PATCH 04/38] hw.perflevel[01].cpusperl changed to hw.perflevel[01].cpusperl2 --- cpuid_arm64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 20dbead23..03563a23b 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -378,9 +378,9 @@ int detect(void) cpulowperf=value64; sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0); if (value64 > 1) { - sysctlbyname("hw.perflevel0.cpusperl",&value64,&length64,NULL,0); + sysctlbyname("hw.perflevel0.cpusperl2",&value64,&length64,NULL,0); cpuhiperf=value64; - sysctlbyname("hw.perflevel1.cpusperl",&value64,&length64,NULL,0); + sysctlbyname("hw.perflevel1.cpusperl2",&value64,&length64,NULL,0); cpulowperf=value64; } sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); From d1c2528aed50bac13ef6b41aba012d2fea30eb4b Mon Sep 17 00:00:00 2001 From: zanpeeters Date: Tue, 15 Apr 2025 17:14:19 -0700 Subject: [PATCH 05/38] Add L1_DATA_LINESIZE for ifdef __APPLE__ --- cpuid_arm64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 03563a23b..95c1b9519 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -702,6 +702,7 @@ void get_cpuconfig(void) printf("#define L1_CODE_SIZE %lld \n",value64); sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); printf("#define L1_CODE_LINESIZE %lld \n",value64); + printf("#define L1_DATA_LINESIZE %lld \n",value64); sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); printf("#define L1_DATA_SIZE %lld \n",value64); sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); From acef78c778631ee23c8e23214b41226872e454f5 Mon Sep 17 00:00:00 2001 From: zanpeeters Date: Tue, 15 Apr 2025 17:17:17 -0700 Subject: [PATCH 06/38] Reset buffer length before every call to sysctlbyname. --- cpuid_arm64.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 95c1b9519..c60725828 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -374,15 +374,20 @@ int detect(void) } #else #ifdef __APPLE__ + length64 = sizeof(value64); sysctlbyname("hw.ncpu",&value64,&length64,NULL,0); cpulowperf=value64; + length64 = sizeof(value64); sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0); if (value64 > 1) { + length64 = sizeof(value64); sysctlbyname("hw.perflevel0.cpusperl2",&value64,&length64,NULL,0); cpuhiperf=value64; + length64 = sizeof(value64); sysctlbyname("hw.perflevel1.cpusperl2",&value64,&length64,NULL,0); cpulowperf=value64; } + length64 = sizeof(value64); sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 @@ -467,6 +472,7 @@ int n=0; printf("#define NUM_CORES_HP %d\n",cpuhiperf); #endif #ifdef __APPLE__ + length64 = sizeof(value64); sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); printf("#define NUM_CORES %d\n",value); if (cpulowperf >0) @@ -698,13 +704,17 @@ void get_cpuconfig(void) case CPU_VORTEX: printf("#define VORTEX \n"); #ifdef __APPLE__ + length64 = sizeof(value64); sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); printf("#define L1_CODE_SIZE %lld \n",value64); + length64 = sizeof(value64); sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); printf("#define L1_CODE_LINESIZE %lld \n",value64); printf("#define L1_DATA_LINESIZE %lld \n",value64); + length64 = sizeof(value64); sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); printf("#define L1_DATA_SIZE %lld \n",value64); + length64 = sizeof(value64); sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); printf("#define L2_SIZE %lld \n",value64); #endif From d9369bda1e5641ea1d5509ed122bc3f4181ff18f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Apr 2025 01:09:57 -0700 Subject: [PATCH 07/38] Update and amend parameters for Neoverse cpus --- cmake/prebuild.cmake | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index c8adf4ab2..4c100a770 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1006,15 +1006,15 @@ endif () "#define HAVE_SVE\n" "#define ARMV8\n") set(SGEMM_UNROLL_M 16) - set(SGEMM_UNROLL_N 4) - set(DGEMM_UNROLL_M 8) - set(DGEMM_UNROLL_N 4) - set(CGEMM_UNROLL_M 8) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 4) + set(DGEMM_UNROLL_N 8) + set(CGEMM_UNROLL_M 2) set(CGEMM_UNROLL_N 4) - set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_M 2) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) - elseif ("${TCORE}" STREQUAL "NEOVERSEN2" or "${TCORE}" STREQUAL "ARMV9SME") + elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" "#define L1_CODE_LINESIZE\t64\n" @@ -1249,6 +1249,25 @@ endif () set(ZGEMM_UNROLL_M 2) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "ARMV8SVE" OR "${TCORE}" STREQUAL "CORTEXA510" OR "${TCORE}" STREQUAL "CORTEXX2" OR "${TCORE}" STREQUAL "ARMV9") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE\t32768\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L2_SIZE\t262144\n" + "#define L2_LINESIZE\t64\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define L2_ASSOCIATIVE\t32\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 4) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 4) + set(DGEMM_UNROLL_N 8) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "P5600") file(APPEND ${TARGET_CONF_TEMP} "#define L2_SIZE 1048576\n" @@ -1409,9 +1428,11 @@ endif () # GetArch_2nd foreach(float_char S;D;Q;C;Z;X) if (NOT DEFINED ${float_char}GEMM_UNROLL_M) + message(STATUS "setting unrollm=2") set(${float_char}GEMM_UNROLL_M 2) endif() if (NOT DEFINED ${float_char}GEMM_UNROLL_N) + message(STATUS "setting unrolln=2") set(${float_char}GEMM_UNROLL_N 2) endif() endforeach() From d53572880398016fb5bee5b7aa96131926d295ec Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Wed, 9 Apr 2025 12:54:57 +0000 Subject: [PATCH 08/38] Improve performance for SGEMVN on NEONVERSEN1 --- kernel/arm64/KERNEL.NEOVERSEN1 | 2 +- kernel/arm64/sgemv_n_neon.c | 219 +++++++++++++++++++++++++++++++++ 2 files changed, 220 insertions(+), 1 deletion(-) create mode 100644 kernel/arm64/sgemv_n_neon.c diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 index e623814d6..de4d33c74 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN1 +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -60,7 +60,7 @@ DSCALKERNEL = scal.S CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S -SGEMVNKERNEL = gemv_n.S +SGEMVNKERNEL = sgemv_n_neon.c DGEMVNKERNEL = gemv_n.S CGEMVNKERNEL = zgemv_n.S ZGEMVNKERNEL = zgemv_n.S diff --git a/kernel/arm64/sgemv_n_neon.c b/kernel/arm64/sgemv_n_neon.c new file mode 100644 index 000000000..5fa86b350 --- /dev/null +++ b/kernel/arm64/sgemv_n_neon.c @@ -0,0 +1,219 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + if (inc_x == 1 && inc_y == 1) { + FLOAT *a0_ptr = a + lda * 0; + FLOAT *a1_ptr = a + lda * 1; + FLOAT *a2_ptr = a + lda * 2; + FLOAT *a3_ptr = a + lda * 3; + FLOAT *a4_ptr = a + lda * 4; + FLOAT *a5_ptr = a + lda * 5; + FLOAT *a6_ptr = a + lda * 6; + FLOAT *a7_ptr = a + lda * 7; + + j = 0; + while (j + 3 < n) { + float32x4_t x0_vec = vld1q_f32(x + j); + x0_vec = vmulq_n_f32(x0_vec, alpha); + i = 0; + while (i + 7 < m) { + float32x4_t a00_vec = vld1q_f32(a0_ptr + i); + float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4); + float32x4_t a10_vec = vld1q_f32(a1_ptr + i); + float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4); + float32x4_t a20_vec = vld1q_f32(a2_ptr + i); + float32x4_t a21_vec = vld1q_f32(a2_ptr + i + 4); + float32x4_t a30_vec = vld1q_f32(a3_ptr + i); + float32x4_t a31_vec = vld1q_f32(a3_ptr + i + 4); + + float32x4_t y0_vec = vld1q_f32(y + i); + float32x4_t y1_vec = vld1q_f32(y + i + 4); + y0_vec = vmlaq_laneq_f32(y0_vec, a00_vec, x0_vec, 0); + y0_vec = vmlaq_laneq_f32(y0_vec, a10_vec, x0_vec, 1); + y0_vec = vmlaq_laneq_f32(y0_vec, a20_vec, x0_vec, 2); + y0_vec = vmlaq_laneq_f32(y0_vec, a30_vec, x0_vec, 3); + y1_vec = vmlaq_laneq_f32(y1_vec, a01_vec, x0_vec, 0); + y1_vec = vmlaq_laneq_f32(y1_vec, a11_vec, x0_vec, 1); + y1_vec = vmlaq_laneq_f32(y1_vec, a21_vec, x0_vec, 2); + y1_vec = vmlaq_laneq_f32(y1_vec, a31_vec, x0_vec, 3); + + vst1q_f32(y + i, y0_vec); + vst1q_f32(y + i + 4, y1_vec); + + i += 8; + } + while (i + 3 < m) { + float32x4_t a0_vec = vld1q_f32(a0_ptr + i); + float32x4_t a1_vec = vld1q_f32(a1_ptr + i); + float32x4_t a2_vec = vld1q_f32(a2_ptr + i); + float32x4_t a3_vec = vld1q_f32(a3_ptr + i); + + float32x4_t y_vec = vld1q_f32(y + i); + y_vec = vmlaq_laneq_f32(y_vec, a0_vec, x0_vec, 0); + y_vec = vmlaq_laneq_f32(y_vec, a1_vec, x0_vec, 1); + y_vec = vmlaq_laneq_f32(y_vec, a2_vec, x0_vec, 2); + y_vec = vmlaq_laneq_f32(y_vec, a3_vec, x0_vec, 3); + + vst1q_f32(y + i, y_vec); + + i += 4; + } + while (i + 1 < m) { + float32x2_t a0_vec = vld1_f32(a0_ptr + i); + float32x2_t a1_vec = vld1_f32(a1_ptr + i); + float32x2_t a2_vec = vld1_f32(a2_ptr + i); + float32x2_t a3_vec = vld1_f32(a3_ptr + i); + + float32x2_t y_vec = vld1_f32(y + i); + y_vec = vmla_laneq_f32(y_vec, a0_vec, x0_vec, 0); + y_vec = vmla_laneq_f32(y_vec, a1_vec, x0_vec, 1); + y_vec = vmla_laneq_f32(y_vec, a2_vec, x0_vec, 2); + y_vec = vmla_laneq_f32(y_vec, a3_vec, x0_vec, 3); + + vst1_f32(y + i, y_vec); + + i += 2; + } + while (i < m) { + y[i] += a0_ptr[i] * x0_vec[0]; + y[i] += a1_ptr[i] * x0_vec[1]; + y[i] += a2_ptr[i] * x0_vec[2]; + y[i] += a3_ptr[i] * x0_vec[3]; + + i++; + } + + a0_ptr += lda * 4; + a1_ptr += lda * 4; + a2_ptr += lda * 4; + a3_ptr += lda * 4; + + j += 4; + } + while (j + 1 < n) { + float32x2_t x0_vec = vld1_f32(x + j); + x0_vec = vmul_n_f32(x0_vec, alpha); + i = 0; + while (i + 7 < m) { + float32x4_t a00_vec = vld1q_f32(a0_ptr + i); + float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4); + float32x4_t a10_vec = vld1q_f32(a1_ptr + i); + float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4); + + float32x4_t y0_vec = vld1q_f32(y + i); + float32x4_t y1_vec = vld1q_f32(y + i + 4); + y0_vec = vmlaq_lane_f32(y0_vec, a00_vec, x0_vec, 0); + y0_vec = vmlaq_lane_f32(y0_vec, a10_vec, x0_vec, 1); + y1_vec = vmlaq_lane_f32(y1_vec, a01_vec, x0_vec, 0); + y1_vec = vmlaq_lane_f32(y1_vec, a11_vec, x0_vec, 1); + + vst1q_f32(y + i, y0_vec); + vst1q_f32(y + i + 4, y1_vec); + + i += 8; + } + while (i + 3 < m) { + float32x4_t a0_vec = vld1q_f32(a0_ptr + i); + float32x4_t a1_vec = vld1q_f32(a1_ptr + i); + + float32x4_t y_vec = vld1q_f32(y + i); + y_vec = vmlaq_lane_f32(y_vec, a0_vec, x0_vec, 0); + y_vec = vmlaq_lane_f32(y_vec, a1_vec, x0_vec, 1); + + vst1q_f32(y + i, y_vec); + + i += 4; + } + while (i + 1 < m) { + float32x2_t a0_vec = vld1_f32(a0_ptr + i); + float32x2_t a1_vec = vld1_f32(a1_ptr + i); + + float32x2_t y_vec = vld1_f32(y + i); + y_vec = vmla_lane_f32(y_vec, a0_vec, x0_vec, 0); + y_vec = vmla_lane_f32(y_vec, a1_vec, x0_vec, 1); + + vst1_f32(y + i, y_vec); + + i += 2; + } + while (i < m) { + y[i] += a0_ptr[i] * x0_vec[0]; + y[i] += a1_ptr[i] * x0_vec[1]; + + i++; + } + + a0_ptr += lda * 2; + a1_ptr += lda * 2; + + j += 2; + } + while (j < n) { + i = 0; + temp = alpha * x[j]; + while (i < m) { + y[i] += a0_ptr[i] * temp; + i++; + } + + a0_ptr += lda; + j++; + } + return (0); + } + + for (j = 0; j < n; j++) { + temp = alpha * x[ix]; + iy = 0; + for (i = 0; i < m; i++) { + y[iy] += temp * a_ptr[i]; + iy += inc_y; + } + a_ptr += lda; + ix += inc_x; + } + return (0); +} From 1f687b2f6001009b271f9ba790760e35e4334530 Mon Sep 17 00:00:00 2001 From: Han Gao Date: Sun, 20 Apr 2025 14:20:49 +0800 Subject: [PATCH 09/38] Bump xuantie qemu for c910v Signed-off-by: Han Gao --- .github/workflows/c910v.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml index 1dd3a2c71..c5b497316 100644 --- a/.github/workflows/c910v.yml +++ b/.github/workflows/c910v.yml @@ -31,7 +31,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: install build deps run: | @@ -40,18 +40,18 @@ jobs: gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev - name: checkout qemu - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: - repository: T-head-Semi/qemu + repository: XUANTIE-RV/qemu path: qemu - ref: 1e692ebb43d396c52352406323fc782c1ac99a42 + ref: e0ace167effcd36d1f82c7ccb4522b3126011479 # xuantie-qemu-9.0 - name: build qemu run: | # Force use c910v qemu-user - wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch + wget https://github.com/revyos/qemu/commit/222729c7455784dd855216d7a2bec4bd8f2a6800.patch cd qemu - patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch + patch -p1 < ../222729c7455784dd855216d7a2bec4bd8f2a6800.patch export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error" ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system make -j$(nproc) From 0cc248559459eccdf80fdd18510aff6920f9917a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 20 Apr 2025 07:50:04 +0000 Subject: [PATCH 10/38] Explicit unaligned vector load/stores in PPC64LE GEMV kernels --- kernel/power/sgemv_n.c | 757 ++++++++++++++++++----------------------- kernel/power/sgemv_t.c | 311 ++++++++--------- 2 files changed, 477 insertions(+), 591 deletions(-) diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index f5c1ba729..79c651df7 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -17,454 +17,369 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #if !defined(__VEC__) || !defined(__ALTIVEC__) #include "../arm/gemv_n.c" #else -#include "common.h" +#include +#include "common.h" #define NBMAX 4096 -static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - +static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, + BLASLONG lda4, FLOAT *alpha) { BLASLONG i; - FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; - FLOAT x0,x1,x2,x3,x4,x5,x6,x7; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - x4 = xo[4] * *alpha; - x5 = xo[5] * *alpha; - x6 = xo[6] * *alpha; - x7 = xo[7] * *alpha; - __vector float* va0 = (__vector float*)a0; - __vector float* va1 = (__vector float*)a1; - __vector float* va2 = (__vector float*)a2; - __vector float* va3 = (__vector float*)a3; - __vector float* vb0 = (__vector float*)b0; - __vector float* vb1 = (__vector float*)b1; - __vector float* vb2 = (__vector float*)b2; - __vector float* vb3 = (__vector float*)b3; - - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float v_x4 = {x4,x4,x4,x4}; - __vector float v_x5 = {x5,x5,x5,x5}; - __vector float v_x6 = {x6,x6,x6,x6}; - __vector float v_x7 = {x7,x7,x7,x7}; - __vector float* v_y =(__vector float*)y; - - for ( i=0; i< n/4; i++) - { - register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; - v_y[i] =vy; + FLOAT *a0, *a1, *a2, *a3, *b0, *b1, *b2, *b3; + FLOAT x0, x1, x2, x3, x4, x5, x6, x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4; + b1 = a1 + lda4; + b2 = a2 + lda4; + b3 = a3 + lda4; + x0 = xo[0] * (*alpha); + x1 = xo[1] * (*alpha); + x2 = xo[2] * (*alpha); + x3 = xo[3] * (*alpha); + x4 = xo[4] * (*alpha); + x5 = xo[5] * (*alpha); + x6 = xo[6] * (*alpha); + x7 = xo[7] * (*alpha); + + __vector float v_x0 = {x0, x0, x0, x0}; + __vector float v_x1 = {x1, x1, x1, x1}; + __vector float v_x2 = {x2, x2, x2, x2}; + __vector float v_x3 = {x3, x3, x3, x3}; + __vector float v_x4 = {x4, x4, x4, x4}; + __vector float v_x5 = {x5, x5, x5, x5}; + __vector float v_x6 = {x6, x6, x6, x6}; + __vector float v_x7 = {x7, x7, x7, x7}; + + for (i = 0; i < n; i += 4) { + __vector float vy = vec_vsx_ld(0, &y[i]); + __vector float va0 = vec_vsx_ld(0, &a0[i]); + __vector float va1 = vec_vsx_ld(0, &a1[i]); + __vector float va2 = vec_vsx_ld(0, &a2[i]); + __vector float va3 = vec_vsx_ld(0, &a3[i]); + __vector float vb0 = vec_vsx_ld(0, &b0[i]); + __vector float vb1 = vec_vsx_ld(0, &b1[i]); + __vector float vb2 = vec_vsx_ld(0, &b2[i]); + __vector float vb3 = vec_vsx_ld(0, &b3[i]); + vy += v_x0 * va0 + v_x1 * va1 + v_x2 * va2 + v_x3 * va3; + vy += v_x4 * vb0 + v_x5 * vb1 + v_x6 * vb2 + v_x7 * vb3; + vec_vsx_st(vy, 0, &y[i]); } - } - -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, + FLOAT *alpha) { BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - __vector float* va2 = (__vector float*)ap[2]; - __vector float* va3 = (__vector float*)ap[3]; - - for ( i=0; i< n/4; i++ ) - { - register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - v_y[i] =vy; + FLOAT x0, x1, x2, x3; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + x0 = xo[0] * (*alpha); + x1 = xo[1] * (*alpha); + x2 = xo[2] * (*alpha); + x3 = xo[3] * (*alpha); + __vector float v_x0 = {x0, x0, x0, x0}; + __vector float v_x1 = {x1, x1, x1, x1}; + __vector float v_x2 = {x2, x2, x2, x2}; + __vector float v_x3 = {x3, x3, x3, x3}; + + for (i = 0; i < n; i += 4) { + __vector float vy = vec_vsx_ld(0, &y[i]); + __vector float va0 = vec_vsx_ld(0, &a0[i]); + __vector float va1 = vec_vsx_ld(0, &a1[i]); + __vector float va2 = vec_vsx_ld(0, &a2[i]); + __vector float va3 = vec_vsx_ld(0, &a3[i]); + vy += v_x0 * va0 + v_x1 * va1 + v_x2 * va2 + v_x3 * va3; + vec_vsx_st(vy, 0, &y[i]); } +} -} - -static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { BLASLONG i; - FLOAT x0,x1; - x0 = x[0] * *alpha; - x1 = x[1] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + FLOAT x0, x1; + FLOAT *a0, *a1; + a0 = ap[0]; + a1 = ap[1]; + x0 = x[0] * (*alpha); + x1 = x[1] * (*alpha); + __vector float v_x0 = {x0, x0, x0, x0}; + __vector float v_x1 = {x1, x1, x1, x1}; + + for (i = 0; i < n; i += 4) { + __vector float vy = vec_vsx_ld(0, &y[i]); + __vector float va0 = vec_vsx_ld(0, &a0[i]); + __vector float va1 = vec_vsx_ld(0, &a1[i]); + vy += v_x0 * va0 + v_x1 * va1; + vec_vsx_st(vy, 0, &y[i]); } +} -} - - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { BLASLONG i; - FLOAT x0 ; - x0 = x[0] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] ; + FLOAT x0 = x[0] * (*alpha); + __vector float v_x0 = {x0, x0, x0, x0}; + + for (i = 0; i < n; i += 4) { + __vector float vy = vec_vsx_ld(0, &y[i]); + __vector float va0 = vec_vsx_ld(0, &ap[i]); + vy += v_x0 * va0; + vec_vsx_st(vy, 0, &y[i]); } - } - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { BLASLONG i; - - for ( i=0; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + BLASLONG i, n1, m1, m2, m3, n2, lda4, lda8; + FLOAT *a_ptr, *x_ptr, *y_ptr, *ap[4]; + + lda4 = lda << 2; + lda8 = lda << 3; + FLOAT xbuffer[8] __attribute__((aligned(16))); + FLOAT *ybuffer = buffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + if (inc_x == 1) { + n1 = n >> 3; + n2 = n & 7; + } else { + n1 = n >> 2; + n2 = n & 3; + } + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + y_ptr = y; + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if (inc_y != 1) + memset(ybuffer, 0, NB * 4); + else + ybuffer = y_ptr; + + if (inc_x == 1) { + for (i = 0; i < n1; i++) { + sgemv_kernel_4x8(NB, ap, x_ptr, ybuffer, lda4, &alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + if (n2 & 4) { + sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); + a_ptr += lda * 2; + x_ptr += 2; + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); + a_ptr += lda; + x_ptr += 1; + } + + } else { + for (i = 0; i < n1; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); + a_ptr += lda; + } + } + + a += NB; + if (inc_y != 1) { + add_y(NB, ybuffer, y_ptr, inc_y); + y_ptr += NB * inc_y; + } else + y_ptr += NB; + } + + if (m3 == 0) return (0); + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if (lda == 3 && inc_x == 1) { + for (i = 0; i < (n & -4); i += 4) { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr++; + } + + } else { + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if (lda == 2 && inc_x == 1) { + for (i = 0; i < (n & -4); i += 4) { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr++; + } + + } else { + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return (0); + } + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if (lda == 1 && inc_x == 1) { + for (i = 0; i < (n & -4); i += 4) { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + + a_ptr[i + 2] * x_ptr[i + 2] + + a_ptr[i + 3] * x_ptr[i + 3]; + } + + for (; i < n; i++) { + temp += a_ptr[i] * x_ptr[i]; + } + + } else { + for (i = 0; i < n; i++) { + temp += a_ptr[0] * x_ptr[0]; a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); - } - - - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); - } - - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); - } - - - return(0); + x_ptr += inc_x; + } + } + y_ptr[0] += alpha * temp; + return (0); + } + + return (0); } #endif - diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index ed0a24230..8a01d2de4 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -17,12 +17,12 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #if !defined(__VEC__) || !defined(__ALTIVEC__) #include "../arm/gemv_t.c" @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 -#include - -static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; +#include + +static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, + FLOAT *y, FLOAT alpha) { + BLASLONG i; FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; + register __vector float temp0 = {0, 0, 0, 0}; + register __vector float temp1 = {0, 0, 0, 0}; + register __vector float temp2 = {0, 0, 0, 0}; + register __vector float temp3 = {0, 0, 0, 0}; + register __vector float temp4 = {0, 0, 0, 0}; + register __vector float temp5 = {0, 0, 0, 0}; + register __vector float temp6 = {0, 0, 0, 0}; + register __vector float temp7 = {0, 0, 0, 0}; a0 = ap; a1 = ap + lda; @@ -56,43 +56,42 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA a5 = a4 + lda; a6 = a5 + lda; a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - temp4 += v_x[i] * va4[i]; - temp5 += v_x[i] * va5[i]; - temp6 += v_x[i] * va6[i]; - temp7 += v_x[i] * va7[i]; - } - - #if defined(POWER8) - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - #else - register __vector float t0, t1, t2, t3; - register __vector float a = { alpha, alpha, alpha, alpha }; - __vector float *v_y = (__vector float*) y; + for (i = 0; i < n; i += 4) { + __vector float vx = vec_vsx_ld(0, &x[i]); + __vector float vva0 = vec_vsx_ld(0, &a0[i]); + __vector float vva1 = vec_vsx_ld(0, &a1[i]); + __vector float vva2 = vec_vsx_ld(0, &a2[i]); + __vector float vva3 = vec_vsx_ld(0, &a3[i]); + __vector float vva4 = vec_vsx_ld(0, &a4[i]); + __vector float vva5 = vec_vsx_ld(0, &a5[i]); + __vector float vva6 = vec_vsx_ld(0, &a6[i]); + __vector float vva7 = vec_vsx_ld(0, &a7[i]); + temp0 += vx * vva0; + temp1 += vx * vva1; + temp2 += vx * vva2; + temp3 += vx * vva3; + temp4 += vx * vva4; + temp5 += vx * vva5; + temp6 += vx * vva6; + temp7 += vx * vva7; + } + +#if defined(POWER8) + y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1] + temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1] + temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1] + temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1] + temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1] + temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1] + temp7[2] + temp7[3]); +#else + register __vector float t0, t1, t2, t3; + register __vector float a = {alpha, alpha, alpha, alpha}; + __vector float vy0 = vec_vsx_ld(0, y); + __vector float vy1 = vec_vsx_ld(0, &(y[4])); t0 = vec_mergeh(temp0, temp2); t1 = vec_mergel(temp0, temp2); t2 = vec_mergeh(temp1, temp3); @@ -113,44 +112,46 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA temp7 = vec_mergel(t1, t3); temp4 += temp5 + temp6 + temp7; - v_y[0] += a * temp0; - v_y[1] += a * temp4; + vy0 += a * temp0; + vy1 += a * temp4; + vec_vsx_st(vy0, 0, y); + vec_vsx_st(vy1, 0, &(y[4])); #endif } - -static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { +static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, + FLOAT *y, FLOAT alpha) { BLASLONG i = 0; FLOAT *a0, *a1, *a2, *a3; a0 = ap; a1 = ap + lda; a2 = a1 + lda; a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; + register __vector float temp0 = {0, 0, 0, 0}; + register __vector float temp1 = {0, 0, 0, 0}; + register __vector float temp2 = {0, 0, 0, 0}; + register __vector float temp3 = {0, 0, 0, 0}; + + for (i = 0; i < n; i += 4) { + __vector float vx = vec_vsx_ld(0, &x[i]); + __vector float vva0 = vec_vsx_ld(0, &a0[i]); + __vector float vva1 = vec_vsx_ld(0, &a1[i]); + __vector float vva2 = vec_vsx_ld(0, &a2[i]); + __vector float vva3 = vec_vsx_ld(0, &a3[i]); + temp0 += vx * vva0; + temp1 += vx * vva1; + temp2 += vx * vva2; + temp3 += vx * vva3; } - #if defined(POWER8) - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - #else +#if defined(POWER8) + y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1] + temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1] + temp3[2] + temp3[3]); +#else register __vector float t0, t1, t2, t3; - register __vector float a = { alpha, alpha, alpha, alpha }; - __vector float *v_y = (__vector float*) y; + register __vector float a = {alpha, alpha, alpha, alpha}; + __vector float vy0 = vec_vsx_ld(0, y); t0 = vec_mergeh(temp0, temp2); t1 = vec_mergel(temp0, temp2); @@ -162,47 +163,42 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA temp3 = vec_mergel(t1, t3); temp0 += temp1 + temp2 + temp3; - v_y[0] += a * temp0; + vy0 += a * temp0; + vec_vsx_st(vy0, 0, y); #endif } - - -static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { +static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, + FLOAT *y, FLOAT alpha, BLASLONG inc_y) { BLASLONG i; FLOAT *a0, *a1; a0 = ap; a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; + __vector float temp0 = {0, 0, 0, 0}; + __vector float temp1 = {0, 0, 0, 0}; + for (i = 0; i < n; i += 4) { + __vector float vx = vec_vsx_ld(0, &x[i]); + __vector float vva0 = vec_vsx_ld(0, &a0[i]); + __vector float vva1 = vec_vsx_ld(0, &a1[i]); + temp0 += vx * vva0; + temp1 += vx * vva1; } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]); } -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, + FLOAT alpha) { BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i] ; + __vector float temp0 = {0, 0, 0, 0}; + for (i = 0; i < n; i += 4) { + __vector float vx = vec_vsx_ld(0, &x[i]); + __vector float vva0 = vec_vsx_ld(0, &ap[i]); + temp0 += vx * vva0; } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - + y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { @@ -213,20 +209,14 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + + BLASLONG i, j, n1, m1, m2, m3, n2; + FLOAT *a_ptr, *x_ptr, *y_ptr; FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; + FLOAT *xbuffer; if (m < 1) return (0); if (n < 1) return (0); @@ -242,7 +232,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG NB = NBMAX; while (NB == NBMAX) { - m1 -= NB; if (m1 < 0) { if (m2 == 0) break; @@ -260,20 +249,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO BLASLONG lda8 = lda << 3; - if (inc_y == 1) { - for (i = 0; i < n1; i++) { - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - + y_ptr += 8; a_ptr += lda8; - } } else { - for (i = 0; i < n1; i++) { ybuffer[0] = 0; ybuffer[1] = 0; @@ -285,8 +269,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ybuffer[7] = 0; sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - *y_ptr += ybuffer[0]; y_ptr += inc_y; *y_ptr += ybuffer[1]; @@ -307,10 +289,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO a_ptr += lda8; } - } - if (n2 & 4) { ybuffer[0] = 0; ybuffer[1] = 0; @@ -318,7 +298,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ybuffer[3] = 0; sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - a_ptr += lda<<2; + a_ptr += lda << 2; *y_ptr += ybuffer[0]; y_ptr += inc_y; @@ -334,20 +314,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); a_ptr += lda << 1; y_ptr += 2 * inc_y; - } if (n2 & 1) { sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); a_ptr += lda; y_ptr += inc_y; - } a += NB; x += NB * inc_x; - - } if (m3 == 0) return (0); @@ -365,13 +341,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO y_ptr = y; if (lda == 3 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + y_ptr[j + 1] += + aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += + aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += + aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; aj += 12; } @@ -381,38 +358,40 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } } else { - if (inc_y == 1) { - BLASLONG register lda2 = lda << 1; BLASLONG register lda4 = lda << 2; BLASLONG register lda3 = lda2 + lda; for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + y_ptr[j] += + *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + + *(aj + lda + 1) * xtemp1 + + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + + *(aj + lda2 + 1) * xtemp1 + + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + + *(aj + lda3 + 1) * xtemp1 + + *(aj + lda3 + 2) * xtemp2; aj += lda4; } for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j] += + *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; aj += lda; } } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + *y_ptr += + *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; y_ptr += inc_y; aj += lda; } - } - } return (0); } @@ -426,14 +405,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO y_ptr = y; if (lda == 2 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; aj += 8; - } for (; j < n; j++) { @@ -443,22 +420,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } else { if (inc_y == 1) { - BLASLONG register lda2 = lda << 1; BLASLONG register lda4 = lda << 2; BLASLONG register lda3 = lda2 + lda; for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + y_ptr[j + 1] += + *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += + *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += + *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; aj += lda4; } for (; j < n; j++) { - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; aj += lda; } @@ -470,10 +447,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO aj += lda; } } - } return (0); - } FLOAT xtemp = *x_ptr * alpha; @@ -490,10 +465,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO y_ptr[j] += aj[j] * xtemp; } - } else { if (inc_y == 1) { - BLASLONG register lda2 = lda << 1; BLASLONG register lda4 = lda << 2; BLASLONG register lda3 = lda2 + lda; @@ -516,12 +489,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO y_ptr += inc_y; aj += lda; } - } } return (0); - } #endif From d659f3c3f6058ece8a704423b72ece3821cfba4d Mon Sep 17 00:00:00 2001 From: Ruiyang Wu Date: Wed, 16 Apr 2025 12:15:44 -0400 Subject: [PATCH 11/38] Fix "Argument list too long" compilation error for Intel macOS --- CMakeLists.txt | 60 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e56a013b2..5f46b905d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -299,23 +299,49 @@ if (USE_OPENMP) endif() endif() -# Seems that this hack doesn't required since macOS 11 Big Sur -if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) - set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) - if (NOT NOFORTRAN) - set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) - set (CMAKE_Fortran_CREATE_SHARED_LIBRARY - "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " - "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " - "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" - "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" - "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") - else () - set (CMAKE_C_CREATE_SHARED_LIBRARY - "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " - "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " - "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") - endif () +# Fix "Argument list too long" for macOS with Intel CPUs and DYNAMIC_ARCH turned on +if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64")) + # Use response files + set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) + # Always build static library first + if(INTERFACE64) + set(STATIC_FILE "libopenblas_64.a") + else() + set(STATIC_FILE "libopenblas.a") + endif() + if(BUILD_STATIC_LIBS) + set(STATIC_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/${STATIC_FILE}") + else() + add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) + set(STATIC_PATH STATIC_FILE) + endif() + set(CREATE_STATIC_LIBRARY_COMMAND + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru ${STATIC_PATH} && exit 0' " + "sh -c '${CMAKE_AR} -rs ${STATIC_PATH} ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' ") + if(BUILD_SHARED_LIBS) + add_dependencies(${OpenBLAS_LIBNAME}_shared ${OpenBLAS_LIBNAME}_static) + set(SHARED_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib") + endif() + if(USE_OPENMP) + get_target_property(OMP_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES) + else() + set(OMP_LIB "") + endif() + if(NOT NOFORTRAN) + set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) + set(CMAKE_Fortran_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND}) + if(BUILD_SHARED_LIBS) + set(CMAKE_Fortran_CREATE_SHARED_LIBRARY + "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" + "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} dummy.o -o ${SHARED_PATH} ${OMP_LIB}'") + endif() + else() + set(CMAKE_C_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND}) + if(BUILD_SHARED_LIBS) + set(CMAKE_C_CREATE_SHARED_LIBRARY + "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} -o ${SHARED_PATH} ${OMP_LIB}'") + endif() + endif() endif() # Handle MSVC exports From 9aa7a0b2a7b2770adec6ff26b34660d3bcd8c49c Mon Sep 17 00:00:00 2001 From: Ruiyang Wu Date: Sun, 20 Apr 2025 22:55:19 -0400 Subject: [PATCH 12/38] Follow-up to d659f3c --- CMakeLists.txt | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f46b905d..f94c4c474 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -304,19 +304,14 @@ if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64") # Use response files set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) # Always build static library first - if(INTERFACE64) - set(STATIC_FILE "libopenblas_64.a") - else() - set(STATIC_FILE "libopenblas.a") - endif() if(BUILD_STATIC_LIBS) - set(STATIC_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/${STATIC_FILE}") + set(STATIC_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/lib${OpenBLAS_LIBNAME}.a") else() add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) - set(STATIC_PATH STATIC_FILE) + set(STATIC_PATH "lib${OpenBLAS_LIBNAME}.a") endif() set(CREATE_STATIC_LIBRARY_COMMAND - "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru ${STATIC_PATH} && exit 0' " + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/${OpenBLAS_LIBNAME}_static.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru ${STATIC_PATH} && exit 0' " "sh -c '${CMAKE_AR} -rs ${STATIC_PATH} ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' ") if(BUILD_SHARED_LIBS) add_dependencies(${OpenBLAS_LIBNAME}_shared ${OpenBLAS_LIBNAME}_static) From 96d80801bc2a5df7d85f73dbd2ecc8673dad0292 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Apr 2025 22:53:26 +0200 Subject: [PATCH 13/38] Reinstate the CooperLake microkernel --- kernel/x86_64/sbgemv_n.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/sbgemv_n.c b/kernel/x86_64/sbgemv_n.c index b2d4eb74f..08ccace61 100644 --- a/kernel/x86_64/sbgemv_n.c +++ b/kernel/x86_64/sbgemv_n.c @@ -28,9 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -//#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) -//#include "sbgemv_n_microk_cooperlake.c" -//#endif +#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "sbgemv_n_microk_cooperlake.c" +#endif #define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \ ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \ From 99d9f1ff3878e791591671f92f3a2470a1e565e6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Apr 2025 22:55:45 +0200 Subject: [PATCH 14/38] Fix conditional --- kernel/x86_64/sbgemv_n_microk_cooperlake_template.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c index 4711e9720..ab22e0848 100644 --- a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c @@ -231,7 +231,7 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); - if ((m-tag_m_32x) > 16) { + if ((m-tag_m_32x) >= 16) { STORE16_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0) STORE16_MASK_COMPLETE_RESULT(accum512_9, y+tag_m_32x+16, store_tail_mask) } else { From 4ec62d7f730b13d0ee48ce6f03d879f82696d9e6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Apr 2025 23:14:10 +0200 Subject: [PATCH 15/38] remove non-vectorized code path for power8, restoring PR4880 --- kernel/power/sgemv_t.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index 8a01d2de4..c66b003d2 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -77,17 +77,7 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, temp7 += vx * vva7; } -#if defined(POWER8) - y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1] + temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1] + temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1] + temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1] + temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1] + temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1] + temp7[2] + temp7[3]); -#else + register __vector float t0, t1, t2, t3; register __vector float a = {alpha, alpha, alpha, alpha}; __vector float vy0 = vec_vsx_ld(0, y); @@ -116,7 +106,7 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, vy1 += a * temp4; vec_vsx_st(vy0, 0, y); vec_vsx_st(vy1, 0, &(y[4])); -#endif + } static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, @@ -143,12 +133,7 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, temp2 += vx * vva2; temp3 += vx * vva3; } -#if defined(POWER8) - y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1] + temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1] + temp3[2] + temp3[3]); -#else + register __vector float t0, t1, t2, t3; register __vector float a = {alpha, alpha, alpha, alpha}; __vector float vy0 = vec_vsx_ld(0, y); @@ -165,7 +150,7 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, vy0 += a * temp0; vec_vsx_st(vy0, 0, y); -#endif + } static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, From e11744a41163c5ce6e744214f54fb583e294f598 Mon Sep 17 00:00:00 2001 From: Annop Wongwathanarat Date: Tue, 22 Apr 2025 09:40:13 +0000 Subject: [PATCH 16/38] Use SVE kernel for S/DGEMVN for SVE machines --- kernel/arm64/KERNEL.ARMV8SVE | 4 ++-- kernel/arm64/KERNEL.NEOVERSEN2 | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 0e51f2c2f..a8371a03c 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -74,8 +74,8 @@ DSCALKERNEL = scal.S CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S -SGEMVNKERNEL = gemv_n_sve.c -DGEMVNKERNEL = gemv_n.S +SGEMVNKERNEL = gemv_n_sve_v1x3.c +DGEMVNKERNEL = gemv_n_sve_v1x3.c CGEMVNKERNEL = zgemv_n.S ZGEMVNKERNEL = zgemv_n.S diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index b9dc23562..c8d511f20 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -60,8 +60,8 @@ DSCALKERNEL = scal.S CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S +SGEMVNKERNEL = gemv_n_sve_v1x3.c +DGEMVNKERNEL = gemv_n_sve_v1x3.c CGEMVNKERNEL = zgemv_n.S ZGEMVNKERNEL = zgemv_n.S From 08b5c18d707adc363e5047793fdee5218db63b20 Mon Sep 17 00:00:00 2001 From: "Iha, Taisei" Date: Tue, 22 Apr 2025 19:56:44 +0900 Subject: [PATCH 17/38] fixed a potential out-of-bounds on gemv. --- kernel/arm64/gemv_n_sve_v1x3.c | 6 +++--- kernel/arm64/gemv_n_sve_v4x3.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/arm64/gemv_n_sve_v1x3.c b/kernel/arm64/gemv_n_sve_v1x3.c index 44c9a89b9..d6aa3d389 100644 --- a/kernel/arm64/gemv_n_sve_v1x3.c +++ b/kernel/arm64/gemv_n_sve_v1x3.c @@ -77,9 +77,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); - SV_TYPE temp0_vec = SV_DUP(alpha * x0_ptr[ix]); - SV_TYPE temp1_vec = SV_DUP(alpha * x1_ptr[ix]); - SV_TYPE temp2_vec = SV_DUP(alpha * x2_ptr[ix]); + SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0); + SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0); + SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0); i = 0; BLASLONG sve_size = SV_COUNT(); while ((i + sve_size * 1 - 1) < m) { diff --git a/kernel/arm64/gemv_n_sve_v4x3.c b/kernel/arm64/gemv_n_sve_v4x3.c index 92e4f75b6..0a7018303 100644 --- a/kernel/arm64/gemv_n_sve_v4x3.c +++ b/kernel/arm64/gemv_n_sve_v4x3.c @@ -86,9 +86,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, svbool_t pg22 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); svbool_t pg32 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); - SV_TYPE temp0_vec = SV_DUP(alpha * x0_ptr[ix]); - SV_TYPE temp1_vec = SV_DUP(alpha * x1_ptr[ix]); - SV_TYPE temp2_vec = SV_DUP(alpha * x2_ptr[ix]); + SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0); + SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0); + SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0); i = 0; BLASLONG sve_size = SV_COUNT(); while ((i + sve_size * 4 - 1) < m) { From 9c02cdb073264d3c5360f077d4f2baa157e08fc9 Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Mon, 24 Mar 2025 01:00:50 +0530 Subject: [PATCH 18/38] optimise dot using thread throttling for NEOVERSE V1 --- kernel/arm64/dot.c | 53 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/kernel/arm64/dot.c b/kernel/arm64/dot.c index 4607ebc59..ece31bccd 100644 --- a/kernel/arm64/dot.c +++ b/kernel/arm64/dot.c @@ -48,6 +48,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); + +#ifdef DYNAMIC_ARCH +extern char* gotoblas_corename(void); +#endif + +#if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) +static inline int get_dot_optimal_nthreads_neoversev1(BLASLONG N, int ncpu) { + #ifdef DOUBLE + return (N <= 10000L) ? 1 + : (N <= 64500L) ? 1 + : (N <= 100000L) ? MIN(ncpu, 2) + : (N <= 150000L) ? MIN(ncpu, 4) + : (N <= 260000L) ? MIN(ncpu, 8) + : (N <= 360000L) ? MIN(ncpu, 16) + : (N <= 520000L) ? MIN(ncpu, 24) + : (N <= 1010000L) ? MIN(ncpu, 56) + : ncpu; + #else + return (N <= 10000L) ? 1 + : (N <= 110000L) ? 1 + : (N <= 200000L) ? MIN(ncpu, 2) + : (N <= 280000L) ? MIN(ncpu, 4) + : (N <= 520000L) ? MIN(ncpu, 8) + : (N <= 830000L) ? MIN(ncpu, 16) + : (N <= 1010000L) ? MIN(ncpu, 24) + : ncpu; + #endif +} +#endif + +static inline int get_dot_optimal_nthreads(BLASLONG n) { + int ncpu = num_cpu_avail(1); + +#if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16) + return get_dot_optimal_nthreads_neoversev1(n, ncpu); +#elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16) + if (strcmp(gotoblas_corename(), "neoversev1") == 0) { + return get_dot_optimal_nthreads_neoversev1(n, ncpu); + } +#endif + + // Default case + if (n <= 10000L) + return 1; + else + return num_cpu_avail(1); +} #endif static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) @@ -85,10 +132,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y RETURN_TYPE dot = 0.0; #if defined(SMP) - if (inc_x == 0 || inc_y == 0 || n <= 10000) + if (inc_x == 0 || inc_y == 0) nthreads = 1; else - nthreads = num_cpu_avail(1); + nthreads = get_dot_optimal_nthreads(n); if (nthreads == 1) { dot = dot_compute(n, x, inc_x, y, inc_y); @@ -105,7 +152,7 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, result, 0, - ( void *)dot_thread_function, nthreads); + (void *)dot_thread_function, nthreads); ptr = (RETURN_TYPE *)result; for (i = 0; i < nthreads; i++) { From 0c239c9d483840dc6c939601b1a86fad973e9e4a Mon Sep 17 00:00:00 2001 From: abhishek-fujitsu Date: Tue, 22 Apr 2025 21:56:05 +0530 Subject: [PATCH 19/38] update contribution list --- CONTRIBUTORS.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 6b0814dcc..d8f57ef60 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -250,4 +250,7 @@ In chronological order: * Ye Tao * [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 - * [2025-02-27] Add sbgemv_n_neon kernel \ No newline at end of file + * [2025-02-27] Add sbgemv_n_neon kernel + +* Abhishek Kumar + * [2025-04-22] Optimise dot kernel for NEOVERSE V1 \ No newline at end of file From 7616c42095ead2755633eb41d98f5d17d58a9800 Mon Sep 17 00:00:00 2001 From: guoyuanplct Date: Fri, 25 Apr 2025 00:05:15 +0800 Subject: [PATCH 20/38] Optimized RVV_ZVL256B Implementation of zgemv_n The implementation of zgemv_n using RVV_ZVL256B has been optimized. Compared to the previous implementation, it has achieved a 1.5x performance improvement. --- kernel/riscv64/zgemv_n_vector.c | 197 ++++++++++++++++++++++++++++---- 1 file changed, 174 insertions(+), 23 deletions(-) diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index 104d3865d..77f55cdd9 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -27,23 +27,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) -#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) -#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) -#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) -#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) -#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) +#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) +#define FLOAT_V_T vfloat32m2_t +#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2) +#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2) +#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m2) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m2) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m2) +#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m2) +#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2) #else -#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) -#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) -#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) -#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) -#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) -#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) +#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) +#define FLOAT_V_T vfloat64m2_t +#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2) +#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2) +#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m2) +#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m2) +#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m2) +#define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m2) +#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2) #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -51,8 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG i = 0, j = 0, k = 0; BLASLONG ix = 0, iy = 0; FLOAT *a_ptr = a; - FLOAT temp_r = 0.0, temp_i = 0.0; - FLOAT_V_T va0, va1, vy0, vy1; + FLOAT temp_r = 0.0, temp_i = 0.0 ,temp_r1 ,temp_i1, temp_r2 ,temp_i2 ,temp_r3, temp_i3,temp_rr[4] ,temp_ii[4]; + FLOAT_V_T va0, va1, vy0, vy1,vy0_new, vy1_new, va2 , va3 , va4 , va5, va6 , va7, temp_iv , temp_rv,x_v0 , x_v1,temp_v1 , temp_v2 , temp_v3 , temp_v4; unsigned int gvl = 0; BLASLONG stride_a = sizeof(FLOAT) * 2; BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; @@ -60,12 +64,20 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG inc_yv = inc_y * gvl * 2; BLASLONG inc_x2 = inc_x * 2; BLASLONG lda2 = lda * 2; + vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1_new = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); for(k=0,j=0; k Date: Fri, 25 Apr 2025 00:27:27 +0800 Subject: [PATCH 21/38] Format the code --- kernel/riscv64/zgemv_n_vector.c | 342 ++++++++++++++++---------------- 1 file changed, 169 insertions(+), 173 deletions(-) diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c index 77f55cdd9..cbed06c97 100644 --- a/kernel/riscv64/zgemv_n_vector.c +++ b/kernel/riscv64/zgemv_n_vector.c @@ -52,11 +52,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i = 0, j = 0, k = 0; + BLASLONG i = 0, j = 0, k = 0; BLASLONG ix = 0, iy = 0; FLOAT *a_ptr = a; - FLOAT temp_r = 0.0, temp_i = 0.0 ,temp_r1 ,temp_i1, temp_r2 ,temp_i2 ,temp_r3, temp_i3,temp_rr[4] ,temp_ii[4]; - FLOAT_V_T va0, va1, vy0, vy1,vy0_new, vy1_new, va2 , va3 , va4 , va5, va6 , va7, temp_iv , temp_rv,x_v0 , x_v1,temp_v1 , temp_v2 , temp_v3 , temp_v4; + FLOAT temp_r = 0.0, temp_i = 0.0, temp_r1, temp_i1, temp_r2, temp_i2, temp_r3, temp_i3, temp_rr[4], temp_ii[4]; + FLOAT_V_T va0, va1, vy0, vy1, vy0_new, vy1_new, va2, va3, va4, va5, va6, va7, temp_iv, temp_rv, x_v0, x_v1, temp_v1, temp_v2, temp_v3, temp_v4; unsigned int gvl = 0; BLASLONG stride_a = sizeof(FLOAT) * 2; BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; @@ -64,56 +64,58 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, BLASLONG inc_yv = inc_y * gvl * 2; BLASLONG inc_x2 = inc_x * 2; BLASLONG lda2 = lda * 2; - vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl); - vy1_new = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); - for(k=0,j=0; k Date: Wed, 30 Apr 2025 16:40:44 +0800 Subject: [PATCH 22/38] Loongarch64: fixed amax_lasx --- kernel/loongarch64/amax_lasx.S | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/loongarch64/amax_lasx.S b/kernel/loongarch64/amax_lasx.S index e964d4ddb..8d2acf283 100644 --- a/kernel/loongarch64/amax_lasx.S +++ b/kernel/loongarch64/amax_lasx.S @@ -56,17 +56,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif + xvxor.v VM0, VM0, VM0 bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT -#ifdef DOUBLE - xvldrepl.d VM0, X, 0 -#else - xvldrepl.w VM0, X, 0 -#endif - XVFSUB VM0, VM0, VM0 bne INCX, TEMP, .L20 srai.d I, N, 4 From be525521ad204c6080c10a8eca10799b67abd4e4 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:40:55 +0800 Subject: [PATCH 23/38] Loongarch64: fixed asum_lasx --- kernel/loongarch64/asum_lasx.S | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/kernel/loongarch64/asum_lasx.S b/kernel/loongarch64/asum_lasx.S index 9a2c031f3..e5ab4df3d 100644 --- a/kernel/loongarch64/asum_lasx.S +++ b/kernel/loongarch64/asum_lasx.S @@ -103,21 +103,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfadd.d res1, VX2, res1 xvfadd.d res1, VX3, res1 #else - xvfadd.s res2, res1, res2 xvpickve.w VX1, res1, 1 xvpickve.w VX2, res1, 2 xvpickve.w VX3, res1, 3 xvfadd.s res1, VX1, res1 xvfadd.s res1, VX2, res1 xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 + xvpickve.w VX0, res1, 4 + xvpickve.w VX1, res1, 5 + xvpickve.w VX2, res1, 6 + xvpickve.w VX3, res1, 7 xvfadd.s res1, VX0, res1 xvfadd.s res1, VX1, res1 xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 #endif .align 3 @@ -217,21 +216,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfadd.d res1, VX2, res1 xvfadd.d res1, VX3, res1 #else - xvfadd.s res2, res1, res2 xvpickve.w VX1, res1, 1 xvpickve.w VX2, res1, 2 xvpickve.w VX3, res1, 3 xvfadd.s res1, VX1, res1 xvfadd.s res1, VX2, res1 xvfadd.s res1, VX3, res1 - xvpickve.w VX0, res2, 4 - xvpickve.w VX1, res2, 5 - xvpickve.w VX2, res2, 6 - xvpickve.w VX3, res2, 7 + xvpickve.w VX0, res1, 4 + xvpickve.w VX1, res1, 5 + xvpickve.w VX2, res1, 6 + xvpickve.w VX3, res1, 7 xvfadd.s res1, VX0, res1 xvfadd.s res1, VX1, res1 xvfadd.s res1, VX2, res1 - xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 #endif .align 3 From 74c97ef814c85134b3f6556f07cfb37b20ac93d5 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:41:05 +0800 Subject: [PATCH 24/38] Loongarch64: fixed cdot_lasx --- kernel/loongarch64/cdot_lasx.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/loongarch64/cdot_lasx.S b/kernel/loongarch64/cdot_lasx.S index 0583e56ea..32b8bf982 100644 --- a/kernel/loongarch64/cdot_lasx.S +++ b/kernel/loongarch64/cdot_lasx.S @@ -288,7 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w x2, t2, 6 xvinsgr2vr.w x1, t3, 7 xvinsgr2vr.w x2, t4, 7 - addi.d Y, Y, 8 * SIZE + addi.d Y, Y, 16 * SIZE xvpickev.w x3, VX3, VX2 xvpickod.w x4, VX3, VX2 xvfmadd.s res1, x1, x3, res1 From d49319c2d2667bbc604dd78e0ec5831399c6b8f7 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:41:18 +0800 Subject: [PATCH 25/38] Loongarch64: fixed cnrm2_lasx --- kernel/loongarch64/cnrm2_lasx.S | 78 ++++++++++++++++++++++----------- 1 file changed, 53 insertions(+), 25 deletions(-) diff --git a/kernel/loongarch64/cnrm2_lasx.S b/kernel/loongarch64/cnrm2_lasx.S index 3a60069ac..034f9de79 100644 --- a/kernel/loongarch64/cnrm2_lasx.S +++ b/kernel/loongarch64/cnrm2_lasx.S @@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VX4 $xr21 #define res1 $xr19 #define res2 $xr20 +#define RCP $f2 +#define VALPHA $xr3 PROLOGUE @@ -55,10 +57,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT INCX, 0(INCX) #endif - xvxor.v res1, res1, res1 - xvxor.v res2, res2, res2 bge $r0, N, .L999 beq $r0, INCX, .L999 + + addi.d $sp, $sp, -32 + st.d $ra, $sp, 0 + st.d N, $sp, 8 + st.d X, $sp, 16 + st.d INCX, $sp, 24 +#ifdef DYNAMIC_ARCH + bl camax_k_LA264 +#else + bl camax_k +#endif + ld.d $ra, $sp, 0 + ld.d N, $sp, 8 + ld.d X, $sp, 16 + ld.d INCX, $sp, 24 + addi.d $sp, $sp, 32 + + frecip.s RCP, $f0 + vreplvei.w $vr3, $vr2, 0 + xvpermi.d VALPHA, $xr3,0x00 + xvxor.v res1, res1, res1 + xvxor.v res2, res2, res2 + fcmp.ceq.s $fcc0, $f0, $f19 + bcnez $fcc0, .L999 + li.d TEMP, SIZE slli.d INCX, INCX, ZBASE_SHIFT srai.d I, N, 2 @@ -67,13 +92,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L10: - xvld VX0, X, 0 * SIZE - xvfcvtl.d.s VX1, VX0 - xvfcvth.d.s VX2, VX0 - xvfmadd.d res1, VX1, VX1, res1 - xvfmadd.d res2, VX2, VX2, res2 addi.d I, I, -1 - addi.d X, X, 8 * SIZE + + xvld VX0, X, 0 * SIZE + xvld VX1, X, 8 * SIZE + xvfmul.s VX0, VX0, VALPHA + xvfmul.s VX1, VX1, VALPHA + xvfmadd.s res1, VX0, VX0, res1 + xvfmadd.s res2, VX1, VX1, res2 + + addi.d X, X, 16 * SIZE blt $r0, I, .L10 .align 3 b .L996 @@ -103,22 +131,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 add.d X, X, INCX - xvfcvtl.d.s VX1, VX0 - xvfcvth.d.s VX2, VX0 - xvfmadd.d res1, VX1, VX1, res1 - xvfmadd.d res2, VX2, VX2, res2 + xvfmul.s VX0, VX0, VALPHA + xvfmadd.s res2, VX0, VX0, res2 addi.d I, I, -1 blt $r0, I, .L21 b .L996 .L996: - xvfadd.d res1, res1, res2 - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - xvfadd.d res1, VX1, res1 - xvfadd.d res1, VX2, res1 - xvfadd.d res1, VX3, res1 + xvfadd.s res1, res1, res2 + xvpermi.d VX1, res1, 0x4e + xvfadd.s res1, res1, VX1 + vreplvei.w $vr17, $vr19, 1 + vreplvei.w $vr18, $vr19, 2 + vreplvei.w $vr21, $vr19, 3 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 + xvfadd.s res1, VX4, res1 .align 3 .L997: @@ -130,18 +158,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fld.s a1, X, 0 * SIZE fld.s a2, X, 1 * SIZE addi.d I, I, -1 - fcvt.d.s a1, a1 - fcvt.d.s a2, a2 - fmadd.d res, a1, a1, res - fmadd.d res, a2, a2, res + fmul.s a1, a1, RCP + fmul.s a2, a2, RCP + fmadd.s res, a1, a1, res + fmadd.s res, a2, a2, res add.d X, X, INCX blt $r0, I, .L998 .align 3 .L999: - fsqrt.d res, res + fsqrt.s res, res + fmul.s $f0, res, $f0 move $r4, $r17 - fcvt.s.d $f0, res jirl $r0, $r1, 0x0 EPILOGUE From a98dd6d91156862d16fa0c43d718e3dfaf70e279 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:41:28 +0800 Subject: [PATCH 26/38] Loongarch64: fixed copy_lasx --- kernel/loongarch64/copy_lasx.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/loongarch64/copy_lasx.S b/kernel/loongarch64/copy_lasx.S index 31f91cec1..6a723fb1e 100644 --- a/kernel/loongarch64/copy_lasx.S +++ b/kernel/loongarch64/copy_lasx.S @@ -260,9 +260,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d Y, Y, INCY ST a2, Y, 0 add.d Y, Y, INCY - ST a3, X, 0 + ST a3, Y, 0 add.d Y, Y, INCY - ST a4, X, 0 + ST a4, Y, 0 add.d Y, Y, INCY LD a1, X, 0 add.d X, X, INCX @@ -276,9 +276,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d Y, Y, INCY ST a2, Y, 0 add.d Y, Y, INCY - ST a3, X, 0 + ST a3, Y, 0 add.d Y, Y, INCY - ST a4, X, 0 + ST a4, Y, 0 add.d Y, Y, INCY addi.d I, I, -1 blt $r0, I, .L222 From dc5fa29851ea3c7c1e27195b21e3d593276d5475 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:41:39 +0800 Subject: [PATCH 27/38] Loongarch64: fixed cscal_lasx --- kernel/loongarch64/cscal_lasx.S | 247 ++++++++------------------------ 1 file changed, 61 insertions(+), 186 deletions(-) diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S index f53526663..e32071def 100644 --- a/kernel/loongarch64/cscal_lasx.S +++ b/kernel/loongarch64/cscal_lasx.S @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHAI $f1 #define X $r7 #define INCX $r8 +#define DUMMY2 $r9 #define I $r12 #define TEMP $r13 @@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, N, .L999 bge $r0, INCX, .L999 + ld.d DUMMY2, $sp, 0 li.d TEMP, 1 movgr2fr.d a1, $r0 FFINT a1, a1 @@ -86,24 +88,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif bne INCX, TEMP, .L22 +/////// INCX == 1 //////// .L11: - bge $r0, I, .L997 CMPEQ $fcc0, ALPHAR, a1 CMPEQ $fcc1, ALPHAI, a1 - bceqz $fcc0, .L13 - b .L14 - .align 3 + bge $r0, I, .L19 +/////// INCX == 1 && N >= 4 //////// + bnez DUMMY2, .L17 // if DUMMPY2 == 1, called from c/zscal. -.L13: - bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 - b .L113 //alpha_r != 0.0 && alpha_i == 0.0 + bceqz $fcc0, .L17 -.L14: - bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 - b .L111 //alpha_r == 0.0 && alpha_i == 0.0 - .align 3 + bceqz $fcc1, .L17 -.L111: //alpha_r == 0.0 && alpha_i == 0.0 +.L15: //alpha_r == 0.0 && alpha_i == 0.0 xvst VXZ, X, 0 * SIZE #ifdef DOUBLE xvst VXZ, X, 4 * SIZE @@ -113,41 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d X, X, 16 * SIZE #endif addi.d I, I, -1 - blt $r0, I, .L111 - b .L997 - .align 3 - -.L113: //alpha_r != 0.0 && alpha_i == 0.0 - xvld VX0, X, 0 * SIZE -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvpickev.d x1, VX1, VX0 - xvpickod.d x2, VX1, VX0 - xvfmul.d x3, VXAR, x1 - xvfmul.d x4, VXAR, x2 - xvilvl.d VX2, x4 ,x3 - xvilvh.d VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 4 * SIZE - addi.d X, X, 8 * SIZE -#else - xvld VX1, X, 8 * SIZE - xvpickev.w x1, VX1, VX0 - xvpickod.w x2, VX1, VX0 - xvfmul.s x3, VXAR, x1 - xvfmul.s x4, VXAR, x2 - xvilvl.w VX2, x4 ,x3 - xvilvh.w VX3, x4, x3 - xvst VX2, X, 0 * SIZE - xvst VX3, X, 8 * SIZE - addi.d X, X, 16 * SIZE -#endif - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 + blt $r0, I, .L15 + b .L19 .align 3 -.L114: //alpha_r != 0.0 && alpha_i != 0.0 +.L17: xvld VX0, X, 0 * SIZE #ifdef DOUBLE xvld VX1, X, 4 * SIZE @@ -177,29 +144,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d X, X, 16 * SIZE #endif addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 + blt $r0, I, .L17 + b .L19 .align 3 +/////// INCX == 1 && N < 8 /////// +.L19: +#ifdef DOUBLE + andi I, N, 3 +#else + andi I, N, 7 +#endif + beqz I, .L999 + bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. + + bceqz $fcc0, .L998 + + bceqz $fcc1, .L998 + + b .L995 // alpha_r == 0.0 && alpha_i == 0.0 + .align 3 + +/////// INCX != 1 //////// .L22: - bge $r0, I, .L997 - move XX, X CMPEQ $fcc0, ALPHAR, a1 CMPEQ $fcc1, ALPHAI, a1 - bceqz $fcc0, .L23 - b .L24 - .align 3 - -.L23: - bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 - b .L223 //alpha_r != 0.0 && alpha_i == 0.0 + move XX, X + bge $r0, I, .L29 + bnez DUMMY2, .L25 // if DUMMPY2 == 1, called from c/zscal. + bceqz $fcc0, .L25 -.L24: - bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 - b .L221 //alpha_r == 0.0 && alpha_i == 0.0 - .align 3 + bceqz $fcc1, .L25 -.L221: //alpha_r == 0.0 && alpha_i == 0.0 +.L27: //alpha_r == 0.0 && alpha_i == 0.0 #ifdef DOUBLE xvstelm.d VXZ, X, 0, 0 xvstelm.d VXZ, X, 1 * SIZE, 0 @@ -239,122 +216,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif add.d X, X, INCX addi.d I, I, -1 - blt $r0, I, .L221 - b .L997 - .align 3 - -.L223: //alpha_r != 0.0 && alpha_i == 0.0 -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.d x1, t1, 0 - xvinsgr2vr.d x2, t2, 0 - xvinsgr2vr.d x1, t3, 1 - xvinsgr2vr.d x2, t4, 1 - ld.d t1, X, 0 * SIZE - ld.d t2, X, 1 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - ld.d t4, X, 1 * SIZE - xvinsgr2vr.d x1, t1, 2 - xvinsgr2vr.d x2, t2, 2 - xvinsgr2vr.d x1, t3, 3 - xvinsgr2vr.d x2, t4, 3 - add.d X, X, INCX - - xvfmul.d x3, VXAR, x1 - xvfmul.d x4, VXAR, x2 - addi.d I, I, -1 - xvstelm.d x3, XX, 0 * SIZE, 0 - xvstelm.d x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 1 - xvstelm.d x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 2 - xvstelm.d x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.d x3, XX, 0 * SIZE, 3 - xvstelm.d x4, XX, 1 * SIZE, 3 -#else - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 0 - xvinsgr2vr.w x2, t2, 0 - xvinsgr2vr.w x1, t3, 1 - xvinsgr2vr.w x2, t4, 1 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 2 - xvinsgr2vr.w x2, t2, 2 - xvinsgr2vr.w x1, t3, 3 - xvinsgr2vr.w x2, t4, 3 - add.d X, X, INCX - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 4 - xvinsgr2vr.w x2, t2, 4 - xvinsgr2vr.w x1, t3, 5 - xvinsgr2vr.w x2, t4, 5 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - xvinsgr2vr.w x1, t1, 6 - xvinsgr2vr.w x2, t2, 6 - xvinsgr2vr.w x1, t3, 7 - xvinsgr2vr.w x2, t4, 7 - add.d X, X, INCX - - xvfmul.s x3, VXAR, x1 - xvfmul.s x4, VXAR, x2 - addi.d I, I, -1 - xvstelm.w x3, XX, 0 * SIZE, 0 - xvstelm.w x4, XX, 1 * SIZE, 0 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 1 - xvstelm.w x4, XX, 1 * SIZE, 1 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 2 - xvstelm.w x4, XX, 1 * SIZE, 2 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 3 - xvstelm.w x4, XX, 1 * SIZE, 3 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 4 - xvstelm.w x4, XX, 1 * SIZE, 4 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 5 - xvstelm.w x4, XX, 1 * SIZE, 5 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 6 - xvstelm.w x4, XX, 1 * SIZE, 6 - add.d XX, XX, INCX - xvstelm.w x3, XX, 0 * SIZE, 7 - xvstelm.w x4, XX, 1 * SIZE, 7 -#endif - add.d XX, XX, INCX - blt $r0, I, .L223 - b .L997 + blt $r0, I, .L27 + b .L29 .align 3 -.L224: //alpha_r != 0.0 && alpha_i != 0.0 +.L25: #ifdef DOUBLE ld.d t1, X, 0 * SIZE ld.d t2, X, 1 * SIZE @@ -376,7 +242,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.d x1, t3, 3 xvinsgr2vr.d x2, t4, 3 add.d X, X, INCX - xvfmul.d VX0, VXAI, x2 xvfmsub.d x3, VXAR, x1, VX0 xvfmul.d VX1, VXAI, x1 @@ -434,7 +299,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w x1, t3, 7 xvinsgr2vr.w x2, t4, 7 add.d X, X, INCX - xvfmul.s VX0, VXAI, x2 xvfmsub.s x3, VXAR, x1, VX0 xvfmul.s VX1, VXAI, x1 @@ -465,19 +329,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvstelm.w x4, XX, 1 * SIZE, 7 #endif add.d XX, XX, INCX - blt $r0, I, .L224 - b .L997 + blt $r0, I, .L25 + b .L29 .align 3 -.L997: +/////// INCX != 1 && N < 8 /////// +.L29: #ifdef DOUBLE - andi I, N, 3 + andi I, N, 3 #else - andi I, N, 7 + andi I, N, 7 #endif - bge $r0, I, .L999 - .align 3 + beqz I, .L999 + bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. + + bceqz $fcc0, .L998 + bceqz $fcc1, .L998 + +.L995: // alpha_r == 0.0 && alpha_i == 0.0 + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L995 + b .L999 .L998: LD a1, X, 0 * SIZE LD a2, X, 1 * SIZE @@ -490,11 +366,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ST s2, X, 1 * SIZE add.d X, X, INCX blt $r0, I, .L998 - .align 3 + b .L999 .L999: move $r4, $r12 jirl $r0, $r1, 0x0 .align 3 - EPILOGUE From ba9569e382639ab96ddb896df13007f0e834d5ea Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:41:48 +0800 Subject: [PATCH 28/38] Loongarch64: fixed dot_lasx --- kernel/loongarch64/dot_lasx.S | 86 ++++++++++++----------------------- 1 file changed, 29 insertions(+), 57 deletions(-) diff --git a/kernel/loongarch64/dot_lasx.S b/kernel/loongarch64/dot_lasx.S index 11c896cb9..e72f848f8 100644 --- a/kernel/loongarch64/dot_lasx.S +++ b/kernel/loongarch64/dot_lasx.S @@ -53,8 +53,8 @@ PROLOGUE #endif /* init $f8 and $f9 to zero */ - SUB s1, s1, s1 - SUB s2, s2, s2 + xvxor.v $xr8, $xr8, $xr8 + xvxor.v $xr9, $xr9, $xr9 slli.d INCX, INCX, BASE_SHIFT li.d TEMP, SIZE slli.d INCY, INCY, BASE_SHIFT @@ -64,20 +64,6 @@ PROLOGUE /* !((inc_x == 1) && (inc_y == 1)) */ - /* init $xr8 and $xr9 to zero */ -#ifdef DOUBLE - xvldrepl.d $xr0, X, 0 -#else - xvldrepl.w $xr0, X, 0 -#endif -#ifdef DSDOT - xvfcvtl.d.s $xr0, $xr0 - xvfsub.d $xr8, $xr0, $xr0 - xvfsub.d $xr9, $xr0, $xr0 -#else - XVFSUB $xr8, $xr0, $xr0 - XVFSUB $xr9, $xr0, $xr0 -#endif #ifdef DOUBLE srai.d I, N, 4 @@ -99,31 +85,31 @@ PROLOGUE addi.w I, I, -1 addi.d X, X, 128 addi.d Y, Y, 128 -#ifdef DSDOT +#ifndef DOUBLE xvfcvtl.d.s $xr10, $xr0 xvfcvtl.d.s $xr11, $xr4 xvfcvth.d.s $xr12, $xr0 xvfcvth.d.s $xr13, $xr4 - xvfmadd.d $xr8, $xr10, $xr12, $xr8 - xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfmadd.d $xr8, $xr10, $xr11, $xr8 + xvfmadd.d $xr9, $xr12, $xr13, $xr9 xvfcvtl.d.s $xr10, $xr1 xvfcvtl.d.s $xr11, $xr5 xvfcvth.d.s $xr12, $xr1 xvfcvth.d.s $xr13, $xr5 - xvfmadd.d $xr8, $xr10, $xr12, $xr8 - xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfmadd.d $xr8, $xr10, $xr11, $xr8 + xvfmadd.d $xr9, $xr12, $xr13, $xr9 xvfcvtl.d.s $xr10, $xr2 xvfcvtl.d.s $xr11, $xr6 xvfcvth.d.s $xr12, $xr2 xvfcvth.d.s $xr13, $xr6 - xvfmadd.d $xr8, $xr10, $xr12, $xr8 - xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfmadd.d $xr8, $xr10, $xr11, $xr8 + xvfmadd.d $xr9, $xr12, $xr13, $xr9 xvfcvtl.d.s $xr10, $xr3 xvfcvtl.d.s $xr11, $xr7 xvfcvth.d.s $xr12, $xr3 xvfcvth.d.s $xr13, $xr7 - xvfmadd.d $xr8, $xr10, $xr12, $xr8 - xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfmadd.d $xr8, $xr10, $xr11, $xr8 + xvfmadd.d $xr9, $xr12, $xr13, $xr9 #else XVFMADD $xr8, $xr0, $xr4, $xr8 XVFMADD $xr9, $xr1, $xr5, $xr9 @@ -149,13 +135,13 @@ PROLOGUE addi.w I, I, -1 addi.d X, X, 32 addi.d Y, Y, 32 -#ifdef DSDOT +#ifndef DOUBLE xvfcvtl.d.s $xr10, $xr0 xvfcvtl.d.s $xr11, $xr4 xvfcvth.d.s $xr12, $xr0 xvfcvth.d.s $xr13, $xr4 - xvfmadd.d $xr8, $xr10, $xr12, $xr8 - xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfmadd.d $xr8, $xr10, $xr11, $xr8 + xvfmadd.d $xr9, $xr12, $xr13, $xr9 #else XVFMADD $xr8, $xr0, $xr4, $xr8 #endif @@ -163,27 +149,12 @@ PROLOGUE .align 3 .L14: /* store dot in s1 $f8 */ -#ifdef DSDOT xvfadd.d $xr8, $xr8, $xr9 - fsub.s s2, s2, s2 /* set s2 to 0.0 */ + fsub.d s2, s2, s2 /* set s2 to 0.0 */ xvpermi.q $xr0, $xr8, 0x1 vfadd.d $vr8, $vr8, $vr0 vpackod.d $vr0, $vr8, $vr8 vfadd.d $vr8, $vr8, $vr0 -#else - XVFADD $xr8, $xr8, $xr9 - SUB s2, s2, s2 /* set s2 to 0.0 */ - xvpermi.q $xr0, $xr8, 0x1 - VFADD $vr8, $vr8, $vr0 - vpackod.d $vr0, $vr8, $vr8 -#ifdef DOUBLE - VFADD $vr8, $vr8, $vr0 -#else - VFADD $vr8, $vr8, $vr0 - vpackod.w $vr0, $vr8, $vr8 - VFADD $vr8, $vr8, $vr0 -#endif /* defined DOUBLE */ -#endif /* defined DSDOT */ .align 3 .L15: #ifdef DOUBLE @@ -197,7 +168,7 @@ PROLOGUE /* FLOAT: 1~7 ; DOUBLE: 1~3 */ LD a1, X, 0 LD b1, Y, 0 -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -240,7 +211,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -252,7 +223,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -264,7 +235,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -276,7 +247,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -288,7 +259,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -300,7 +271,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -312,7 +283,7 @@ PROLOGUE add.d X, X, INCX LD b1, Y, 0 * SIZE add.d Y, Y, INCY -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -325,7 +296,7 @@ PROLOGUE LD b1, Y, 0 * SIZE add.d Y, Y, INCY addi.d I, I, -1 -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s2, b1, a1, s2 @@ -346,7 +317,7 @@ PROLOGUE LD b1, Y, 0 * SIZE add.d Y, Y, INCY addi.d I, I, -1 -#ifdef DSDOT +#ifndef DOUBLE fcvt.d.s a1, a1 fcvt.d.s b1, b1 fmadd.d s1, b1, a1, s1 @@ -357,12 +328,13 @@ PROLOGUE .align 3 .L999: -#ifdef DSDOT fadd.d $f0, s1, s2 + move $r4, $r17 +#if defined(DOUBLE) +#elif defined(DSDOT) #else - ADD $f0, s1, s2 + fcvt.s.d $f0, $f0 #endif - move $r4, $r17 jirl $r0, $r1, 0x0 EPILOGUE From b528b1b8ea5e4237cd6c4da4eb2df631f63b5782 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:41:58 +0800 Subject: [PATCH 29/38] Loongarch64: fixed iamax_lasx --- kernel/loongarch64/iamax_lasx.S | 566 ++++++++++++++++---------------- 1 file changed, 282 insertions(+), 284 deletions(-) diff --git a/kernel/loongarch64/iamax_lasx.S b/kernel/loongarch64/iamax_lasx.S index 090da3004..a573fd4b7 100644 --- a/kernel/loongarch64/iamax_lasx.S +++ b/kernel/loongarch64/iamax_lasx.S @@ -56,25 +56,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VI3 $xr8 #define VI4 $xr19 #define VT0 $xr23 +#define VZE $xr3 +#define VT1 $xr4 +#define VT2 $xr5 +#define VC0 $xr6 PROLOGUE li.d i0, 0 bge $r0, N, .L999 bge $r0, INCX, .L999 li.d TEMP, 1 + xvldi VZE, 0 slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 xvld VM0, X, 0 #ifdef DOUBLE + xvfsub.d VT1, VZE, VM0 addi.d i0, i0, 1 srai.d I, N, 3 - bge $r0, I, .L21 - slli.d i0, i0, 2 //4 + xvfmaxa.d VM0, VM0, VT1 + bge $r0, I, .L11 + slli.d i0, i0, 1 //2 xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 + slli.d i0, i0, 1 //4 xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 + addi.d i0, i0, -7 xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 1 @@ -82,19 +89,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.d VI1, i0, 2 addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 - xvinsgr2vr.d VI0, i0, 0 //1 addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 1 //2 + xvinsgr2vr.d VI0, i0, 0 //initialize the index value for vectorization addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 + xvinsgr2vr.d VI0, i0, 1 addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 + xvinsgr2vr.d VI0, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 #else + xvfsub.s VT1, VZE, VM0 addi.w i0, i0, 1 srai.d I, N, 3 + xvfmaxa.s VM0, VM0, VT1 bge $r0, I, .L21 - slli.w i0, i0, 3 //8 + slli.w i0, i0, 2 //4 + xvreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 xvreplgr2vr.w VINC8, i0 addi.w i0, i0, -15 xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization @@ -135,73 +146,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef DOUBLE xvld VX0, X, 0 * SIZE xvadd.d VI1, VI1, VINC8 - xvld VX1, X, 4 * SIZE + xvld VX1, X, 2 * SIZE + xvadd.d VI2, VI1, VINC4 + xvfsub.d VT1, VZE, VX0 + xvfsub.d VT2, VZE, VX1 + xvfmaxa.d VX0, VX0, VT1 + xvfmaxa.d VX1, VX1, VT2 + xvfcmp.clt.d VT0, VX0, VX1 //abx(x0) < abs(x1) + xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf) + xvbitsel.v x2, VI1, VI2, VT0 //i + + xvld VX0, X, 4 * SIZE + xvadd.d VI1, VI2, VINC4 + xvld VX1, X, 6 * SIZE xvadd.d VI2, VI1, VINC4 - xvfmaxa.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 + xvfsub.d VT1, VZE, VX0 + xvfsub.d VT2, VZE, VX1 + xvfmaxa.d VX0, VX0, VT1 + xvfmaxa.d VX1, VX1, VT2 + xvfcmp.clt.d VT0, VX0, VX1 + xvbitsel.v x3, VX0, VX1, VT0 //abs(maxf) + xvbitsel.v x4, VI1, VI2, VT0 //i + xvfcmp.clt.d VC0, x1, x3 + xvbitsel.v x1, x1, x3, VC0 //abs(maxf) + xvbitsel.v x2, x2, x4, VC0 //i + xvfcmp.clt.d VT0, VM0, x1 addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmaxa.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VI0, x2, VT0 #else xvld VX0, X, 0 * SIZE - addi.d I, I, -1 xvadd.w VI1, VI1, VINC8 - xvfmaxa.s VM1, VX0, VM0 - xvfcmp.ceq.s VT0, VM0, VM1 + xvld VX1, X, 4 * SIZE + xvadd.w VI2, VI1, VINC4 + xvfsub.s VT1, VZE, VX0 + xvfsub.s VT2, VZE, VX1 + xvfmaxa.s VX0, VX0, VT1 + xvfmaxa.s VX1, VX1, VT2 + xvfcmp.clt.s VT0, VX0, VX1 + xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf) + xvbitsel.v x2, VI1, VI2, VT0 //i + addi.d I, I, -1 + xvfcmp.clt.s VT0, VM0, x1 addi.d X, X, 8 * SIZE - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VI0, x2, VT0 + #endif blt $r0, I, .L10 .align 3 .L15: #ifdef DOUBLE - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 + vreplvei.d $vr21, $vr20, 0 + vreplvei.d $vr22, $vr20, 1 + vreplvei.d $vr9, $vr15, 0 + vreplvei.d $vr10, $vr15, 1 + fcmp.ceq.d $fcc0, $f9, $f10 + bceqz $fcc0, .L16 + xvfcmp.clt.d VT0, VI1, VI2 + xvbitsel.v VI0, VI2, VI1, VT0 + b .L17 #else - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 + vreplvei.w $vr21, $vr20, 0 + vreplvei.w $vr22, $vr20, 1 + vreplvei.w $vr8, $vr20, 2 + vreplvei.w $vr19, $vr20, 3 + vreplvei.w $vr9, $vr15, 0 + vreplvei.w $vr10, $vr15, 1 + vreplvei.w $vr11, $vr15, 2 + vreplvei.w $vr12, $vr15, 3 + b .L26 #endif - XVFMAXA VM1, x1, x2 - XVCMPEQ VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - XVFMAXA VM0, x3, x4 - XVCMPEQ VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - XVFMAXA VM0, VM0, VM1 - XVCMPEQ VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - CMPEQ $fcc0, $f15, $f9 - bceqz $fcc0, .L26 - XVCMPLT VT0, VI1, VI0 + .align 3 + +#ifdef DOUBLE +.L16: + xvfcmp.clt.d VT0, x1, x2 + xvbitsel.v VI0, VI1, VI2, VT0 + xvbitsel.v VM0, x1, x2, VT0 + .align 3 + +.L17: + movfr2gr.d i0, $f20 + .align 3 + +.L11: //INCX==1 and N<8 + andi I, N, 7 + bge $r0, I, .L14 + srai.d i1, N, 3 + slli.d i1, i1, 3 + addi.d i1, i1, 1 //current index + movgr2fr.d $f21, i1 + movgr2fr.d $f20, i0 + .align 3 + +.L13: + fld.d $f9, X, 0 + fsub.d $f10, $f3, $f9 + xvfmaxa.d x1, x1, x2 + xvfcmp.clt.d VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 xvbitsel.v VI0, VI0, VI1, VT0 - b .L26 + addi.d I, I, -1 + addi.d i1, i1, 1 + addi.d X, X, SIZE + movgr2fr.d $f21, i1 + blt $r0, I, .L13 + movfr2gr.d i0, $f20 + .align 3 + +.L14: + move $r4, $r17 + jirl $r0, $r1, 0x0 .align 3 .L20: // INCX!=1 move TEMP, X -#ifdef DOUBLE addi.d i0, i0, 1 ld.d t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -210,34 +272,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bge $r0, I, .L21 ld.d t2, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX - ld.d t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.d t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX xvinsgr2vr.d VM0, t2, 1 - xvinsgr2vr.d VM0, t3, 2 - xvinsgr2vr.d VM0, t4, 3 - slli.d i0, i0, 2 //4 + slli.d i0, i0, 1 //2 + xvfsub.d VT1, VZE, VM0 xvreplgr2vr.d VINC4, i0 - slli.d i0, i0, 1 //8 + slli.d i0, i0, 1 //4 xvreplgr2vr.d VINC8, i0 - addi.d i0, i0, -15 + addi.d i0, i0, -7 + xvfmaxa.d VM0, VM0, VT1 xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI1, i0, 3 - addi.d i0, i0, 5 + addi.d i0, i0, 3 xvinsgr2vr.d VI0, i0, 0 //1 addi.d i0, i0, 1 xvinsgr2vr.d VI0, i0, 1 //2 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 2 //3 - addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 3 //4 + .align 3 + +.L24: + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t2, 1 + xvadd.d VI1, VI1, VINC8 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t2, 1 + xvadd.d VI2, VI1, VINC4 + + xvfsub.d VT1, VZE, VX0 + xvfsub.d VT2, VZE, VX1 + xvfmaxa.d VX0, VX0, VT1 + xvfmaxa.d VX1, VX1, VT2 + xvfcmp.clt.d VT0, VX0, VX1 + xvbitsel.v x1, VX0, VX1, VT0 + xvbitsel.v x2, VI1, VI2, VT0 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t2, 1 + xvadd.d VI1, VI2, VINC4 + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t1, 0 + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX1, t2, 1 + xvadd.d VI2, VI1, VINC4 + xvfsub.d VT1, VZE, VX0 + xvfsub.d VT2, VZE, VX1 + xvfmaxa.d VX0, VX0, VT1 + xvfmaxa.d VX1, VX1, VT2 + xvfcmp.clt.d VT0, VX0, VX1 + xvbitsel.v x3, VX0, VX1, VT0 + xvbitsel.v x4, VI1, VI2, VT0 + xvfcmp.clt.d VC0, x1, x3 + xvbitsel.v x1, x1, x3, VC0 + xvbitsel.v x2, x2, x4, VC0 + xvfcmp.clt.d VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VI0, x2, VT0 + + addi.d I, I, -1 + blt $r0, I, .L24 + .align 3 + +.L25: + vreplvei.d $vr21, $vr20, 0 + vreplvei.d $vr22, $vr20, 1 + vreplvei.d $vr9, $vr15, 0 + vreplvei.d $vr10, $vr15, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI2 + xvbitsel.v VI0, VI2, VI1, VT0 + b .L27 + .align 3 + +.L26: + xvfcmp.clt.d VT0, x1, x2 + xvbitsel.v VI0, VI1, VI2, VT0 + xvbitsel.v VM0, x1, x2, VT0 + .align 3 + +.L27: + movfr2gr.d i0, $f20 + .align 3 + #else +.L20: // INCX!=1 + move TEMP, X addi.w i0, i0, 1 ld.w t1, TEMP, 0 * SIZE add.d TEMP, TEMP, INCX @@ -253,19 +384,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w VM0, t2, 1 xvinsgr2vr.w VM0, t3, 2 xvinsgr2vr.w VM0, t4, 3 - ld.w t1, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t2, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t3, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - ld.w t4, TEMP, 0 * SIZE - add.d TEMP, TEMP, INCX - xvinsgr2vr.w VM0, t1, 4 - xvinsgr2vr.w VM0, t2, 5 - xvinsgr2vr.w VM0, t3, 6 - xvinsgr2vr.w VM0, t4, 7 - slli.w i0, i0, 3 //8 + slli.w i0, i0, 2 //4 + xvreplgr2vr.w VINC4, i0 + slli.w i0, i0, 1 //8 xvreplgr2vr.w VINC8, i0 addi.w i0, i0, -15 xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization @@ -275,15 +396,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w VI1, i0, 2 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, 1 - xvinsgr2vr.w VI1, i0, 4 - addi.w i0, i0, 1 - xvinsgr2vr.w VI1, i0, 5 - addi.w i0, i0, 1 - xvinsgr2vr.w VI1, i0, 6 - addi.w i0, i0, 1 - xvinsgr2vr.w VI1, i0, 7 - addi.w i0, i0, 1 + addi.w i0, i0, 5 xvinsgr2vr.w VI0, i0, 0 //1 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 1 //2 @@ -291,54 +404,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w VI0, i0, 2 //3 addi.w i0, i0, 1 xvinsgr2vr.w VI0, i0, 3 //4 - addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 4 //5 - addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 5 //6 - addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 6 //7 - addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 7 //8 -#endif .align 3 .L24: -#ifdef DOUBLE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - xvadd.d VI1, VI1, VINC8 - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - xvadd.d VI2, VI1, VINC4 - xvfmaxa.d VM1, VX0, VX1 - xvfcmp.ceq.d VT0, VX0, VM1 - addi.d I, I, -1 - xvbitsel.v VI2, VI2, VI1, VT0 - xvfmaxa.d VM1, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI2, VI0, VT0 -#else ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -351,6 +419,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w VX0, t2, 1 xvinsgr2vr.w VX0, t3, 2 xvinsgr2vr.w VX0, t4, 3 + xvadd.w VI1, VI1, VINC8 ld.w t1, X, 0 * SIZE add.d X, X, INCX ld.w t2, X, 0 * SIZE @@ -359,158 +428,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d X, X, INCX ld.w t4, X, 0 * SIZE add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - xvadd.w VI1, VI1, VINC8 - xvfmaxa.s VM1, VX0, VM0 - xvfcmp.ceq.s VT0, VM1, VM0 + xvinsgr2vr.w VX1, t1, 0 + xvinsgr2vr.w VX1, t2, 1 + xvinsgr2vr.w VX1, t3, 2 + xvinsgr2vr.w VX1, t4, 3 + xvadd.w VI2, VI1, VINC4 + xvfsub.s VT1, VZE, VX0 + xvfsub.s VT2, VZE, VX1 + xvfmaxa.s VX0, VX0, VT1 + xvfmaxa.s VX1, VX1, VT2 + xvfcmp.clt.s VT0, VX0, VX1 + xvbitsel.v x1, VX0, VX1, VT0 + xvbitsel.v x2, VI1, VI2, VT0 //i + addi.d I, I, -1 - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 -#endif + xvfcmp.clt.s VT0, VM0, x1 + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VI0, x2, VT0 blt $r0, I, .L24 .align 3 .L25: -#ifdef DOUBLE - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfcmp.ceq.d VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.d VM0, x4, x3 - xvfcmp.ceq.d VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 -#else - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfmaxa.s VM1, x1, x2 - xvfcmp.ceq.s VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.s VM0, x3, x4 - xvfcmp.ceq.s VT0, x3, VM0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfmaxa.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 -#endif - CMPEQ $fcc0, $f15, $f9 - bceqz $fcc0, .L26 - XVCMPLT VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 + vreplvei.w $vr21, $vr20, 0 + vreplvei.w $vr22, $vr20, 1 + vreplvei.w $vr8, $vr20, 2 + vreplvei.w $vr19, $vr20, 3 + vreplvei.w $vr9, $vr15, 0 + vreplvei.w $vr10, $vr15, 1 + vreplvei.w $vr11, $vr15, 2 + vreplvei.w $vr12, $vr15, 3 .align 3 .L26: - fcmp.ceq.d $fcc0, $f15, $f10 - bceqz $fcc0, .L27 - XVCMPLT VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 + fcmp.ceq.s $fcc0, $f9, $f10 + bceqz $fcc0, .L31 + xvfcmp.clt.s VT0, VI1, VI2 + xvbitsel.v VI1, VI2, VI1, VT0 + b .L32 .align 3 - -.L27: - fcmp.ceq.d $fcc0, $f15, $f11 - bceqz $fcc0, .L28 - XVCMPLT VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 +.L31: + xvfcmp.clt.s VT0, x1, x2 + xvbitsel.v VI1, VI1, VI2, VT0 + xvbitsel.v x1, x1, x2, VT0 .align 3 - -.L28: - fcmp.ceq.d $fcc0, $f15, $f12 - bceqz $fcc0, .L29 - XVCMPLT VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 +.L32: + fcmp.ceq.s $fcc0, $f11, $f12 + bceqz $fcc0, .L33 + xvfcmp.clt.s VT1, VI3, VI4 + xvbitsel.v VI3, VI4, VI3, VT1 + b .L34 .align 3 - -.L29: -#ifdef DOUBLE - movfr2gr.d i0, $f20 -#else - fmov.s $f16, $f20 -#endif +.L33: + xvfcmp.clt.s VT1, x3, x4 + xvbitsel.v x3, x3, x4, VT1 + xvbitsel.v VI3, VI3, VI4, VT1 .align 3 - -#ifdef DOUBLE - -#else -.L252: - xvxor.v VI0, VI0, VI0 - xvor.v VI0, VI0, VX0 - fmov.s $f13, $f15 - xvxor.v VM0, VM0, VM0 - xvor.v VM0, VM0, VX1 - xvpickve.w VI1, VI0, 4 - xvpickve.w VI2, VI0, 5 - xvpickve.w VI3, VI0, 6 - xvpickve.w VI4, VI0, 7 - xvpickve.w x1, VM0, 4 - xvpickve.w x2, VM0, 5 - xvpickve.w x3, VM0, 6 - xvpickve.w x4, VM0, 7 - xvfmaxa.s VM1, x1, x2 - xvfcmp.ceq.s VT0, x1, VM1 - xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.s VM0, x3, x4 - xvfcmp.ceq.s VT0, x3, VM0 - xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.s VM0, VM0, VM1 - xvfcmp.ceq.s VT0, VM0, VM1 - xvbitsel.v VI0, VINC8, VINC4, VT0 - fcmp.ceq.d $fcc0, $f15, $f9 - bceqz $fcc0, .L262 - xvfcmp.clt.s VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 +.L34: + fcmp.ceq.s $fcc0, $f9, $f11 + bceqz $fcc0, .L35 + xvfcmp.clt.s VT0, VI1, VI3 + xvbitsel.v VI0, VI3, VI1, VT0 + xvxor.v VM0, x1, VZE + b .L29 .align 3 - -.L262: - fcmp.ceq.d $fcc0, $f15, $f10 - bceqz $fcc0, .L272 - xvfcmp.clt.s VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 +.L35: + xvfcmp.clt.s VT0, x1, x3 + xvbitsel.v VM0, x1, x3, VT0 + xvbitsel.v VI0, VI1, VI3, VT0 .align 3 -.L272: - fcmp.ceq.d $fcc0, $f15, $f11 - bceqz $fcc0, .L282 - xvfcmp.clt.s VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L282: - fcmp.ceq.d $fcc0, $f15, $f12 - bceqz $fcc0, .L292 - xvfcmp.clt.s VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 +.L29: + movfr2gr.s i0, $f20 .align 3 -.L292: - xvfmaxa.s VM0, VX0, VM0 - xvfcmp.ceq.s VT0, VM0, VX0 - xvbitsel.v VI0, VI0, VI1, VT0 - movfr2gr.s i0, $f20 #endif - -.L21: //N<8 +.L21: // N<8 andi I, N, 7 bge $r0, I, .L999 srai.d i1, N, 3 @@ -521,17 +512,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 3 .L22: - LD $f9, X, 0 + LD $f9, X, 0 +#ifdef DOUBLE + fsub.d $f10, $f3, $f9 + xvfmaxa.d x1, x1, x2 + xvfcmp.clt.d VT0, VM0, x1 +#else + fsub.s $f10, $f3, $f9 + xvfmaxa.s x1, x1, x2 + xvfcmp.clt.s VT0, VM0, x1 +#endif + xvbitsel.v VM0, VM0, x1, VT0 + xvbitsel.v VI0, VI0, VI1, VT0 addi.d I, I, -1 - XVFMAXA VM1, x1, VM0 - XVCMPEQ VT0, VM0, VM1 - add.d X, X, INCX - xvbitsel.v VM0, VM1, VM0, VT0 - xvbitsel.v VI0, VI1, VI0, VT0 addi.d i1, i1, 1 + add.d X, X, INCX movgr2fr.d $f21, i1 blt $r0, I, .L22 - MTG i0, $f20 + MTG i0, $f20 .align 3 .L999: From 6dc4ca23918a12628c44cad4aacd934d2b8417df Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:42:12 +0800 Subject: [PATCH 30/38] Loongarch64: fixed icamax_lasx --- kernel/loongarch64/icamax_lasx.S | 412 +++++++++++++------------------ 1 file changed, 165 insertions(+), 247 deletions(-) diff --git a/kernel/loongarch64/icamax_lasx.S b/kernel/loongarch64/icamax_lasx.S index 7800cb917..fb47aa458 100644 --- a/kernel/loongarch64/icamax_lasx.S +++ b/kernel/loongarch64/icamax_lasx.S @@ -76,66 +76,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d i0, i0, 1 srai.d I, N, 2 bge $r0, I, .L21 - slli.d i0, i0, 2 //4 + slli.d i0, i0, 1 //2 xvreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 + addi.d i0, i0, -3 xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 2 + addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, -1 + addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 2 - xvinsgr2vr.d VI1, i0, 3 addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 2 - xvinsgr2vr.d VI0, i0, 1 //3 + xvinsgr2vr.d VI1, i0, 3 addi.d i0, i0, -1 - xvinsgr2vr.d VI0, i0, 2 //2 - addi.d i0, i0, 2 - xvinsgr2vr.d VI0, i0, 3 //4 + xvinsgr2vr.d VI0, i0, 0 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 #else li.w I, -1 xvreplgr2vr.w VI4, I xvffint.s.w VI4, VI4 // -1 bne INCX, TEMP, .L20 addi.w i0, i0, 1 - srai.d I, N, 3 + srai.d I, N, 2 bge $r0, I, .L21 - slli.w i0, i0, 3 //8 - xvreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 + slli.w i0, i0, 2 //4 + xvreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 2 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, -3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 4 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 5 - addi.w i0, i0, 3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 6 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 0 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 0 //1 + xvinsgr2vr.w VI0, i0, 1 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 3 - xvinsgr2vr.w VI0, i0, 2 //5 + xvinsgr2vr.w VI0, i0, 2 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 3 //6 - addi.w i0, i0, -3 - xvinsgr2vr.w VI0, i0, 4 //3 + xvinsgr2vr.w VI0, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 5 //4 - addi.w i0, i0, 3 - xvinsgr2vr.w VI0, i0, 6 //7 + xvinsgr2vr.w VI0, i0, 5 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 7 //8 + xvinsgr2vr.w VI0, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 #endif .align 3 @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvld VX0, X, 0 * SIZE #ifdef DOUBLE xvadd.d VI1, VI1, VINC4 - xvld VX1, X, 4 * SIZE + xvld VX1, X, 2 * SIZE addi.d I, I, -1 xvpickev.d x1, VX1, VX0 xvpickod.d x2, VX1, VX0 @@ -153,22 +153,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfcmp.clt.d VINC8, x2, VI3 xvbitsel.v x1, x1, x3, VT0 xvbitsel.v x2, x2, x4, VINC8 + xvfadd.d x1, x1, x2 + xvfmax.d x3, VM0, x1 + xvfcmp.ceq.d VT0, x3, VM0 + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 + xvld VX0, X, 4 * SIZE + xvadd.d VI1, VI1, VINC4 + xvld VX1, X, 6 * SIZE + xvpickev.d x1, VX1, VX0 + xvpickod.d x2, VX1, VX0 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 #else - xvadd.w VI1, VI1, VINC8 - xvld VX1, X, 8 * SIZE + xvadd.w VI1, VI1, VINC4 + xvld VX1, X, 4 * SIZE addi.d I, I, -1 xvpickev.w x1, VX1, VX0 xvpickod.w x2, VX1, VX0 xvfmul.s x3, VI4, x1 xvfmul.s x4, VI4, x2 - xvfcmp.clt.s VT0, x1, VI3 - xvfcmp.clt.s VINC4, x2, VI3 - xvbitsel.v x1, x1, x3, VT0 - xvbitsel.v x2, x2, x4, VINC4 #endif - XVFADD x1, x1, x2 - XVFMAX x3, VM0, x1 - XVCMPEQ VT0, x3, VM0 + XVCMPLT VT0, x1, VI3 + XVCMPLT VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + XVFADD x1, x1, x2 + XVFMAX x3, VM0, x1 + XVCMPEQ VT0, x3, VM0 addi.d X, X, 8 * SIZE xvbitsel.v VM0, x3, VM0, VT0 xvbitsel.v VI0, VI1, VI0, VT0 @@ -177,51 +189,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L15: #ifdef DOUBLE - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmax.d VM1, x1, x2 - xvfcmp.ceq.d VT0, VM1, x1 + vreplvei.d $vr21, $vr20, 0 + vreplvei.d $vr22, $vr20, 1 + vreplvei.d $vr9, $vr15, 0 + vreplvei.d $vr10, $vr15, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI2 + xvbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w $vr21, $vr20, 0 + vreplvei.w $vr22, $vr20, 1 + vreplvei.w $vr8, $vr20, 2 + vreplvei.w $vr19, $vr20, 3 + vreplvei.w $vr9, $vr15, 0 + vreplvei.w $vr10, $vr15, 1 + vreplvei.w $vr11, $vr15, 2 + vreplvei.w $vr12, $vr15, 3 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, VM1, x1 xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmax.d VM0, x3, x4 - xvfcmp.ceq.d VT0, x3, VM0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmax.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 -#else - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfcmp.clt.s VT0, x1, x2 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x3, x4 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 -#endif fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - XVCMPLT VT0, VI1, VI0 + xvfcmp.clt.s VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 b .L26 +#endif .align 3 .L20: // INCX!=1 @@ -229,62 +229,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d i0, i0, 1 srai.d I, N, 2 bge $r0, I, .L21 - slli.d i0, i0, 2 //4 + slli.d i0, i0, 1 //2 xvreplgr2vr.d VINC4, i0 - addi.d i0, i0, -7 + addi.d i0, i0, -3 xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization - addi.d i0, i0, 2 + addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 1 - addi.d i0, i0, -1 + addi.d i0, i0, 1 xvinsgr2vr.d VI1, i0, 2 - addi.d i0, i0, 2 - xvinsgr2vr.d VI1, i0, 3 addi.d i0, i0, 1 - xvinsgr2vr.d VI0, i0, 0 //1 - addi.d i0, i0, 2 - xvinsgr2vr.d VI0, i0, 1 //3 + xvinsgr2vr.d VI1, i0, 3 addi.d i0, i0, -1 - xvinsgr2vr.d VI0, i0, 2 //2 - addi.d i0, i0, 2 - xvinsgr2vr.d VI0, i0, 3 //4 + xvinsgr2vr.d VI0, i0, 0 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 1 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 2 + addi.d i0, i0, 1 + xvinsgr2vr.d VI0, i0, 3 #else addi.w i0, i0, 1 - srai.d I, N, 3 + srai.d I, N, 2 bge $r0, I, .L21 - slli.w i0, i0, 3 //8 - xvreplgr2vr.w VINC8, i0 - addi.w i0, i0, -15 + slli.w i0, i0, 2 //4 + xvreplgr2vr.w VINC4, i0 + addi.w i0, i0, -7 xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 1 - addi.w i0, i0, 3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 2 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 3 - addi.w i0, i0, -3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 4 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 5 - addi.w i0, i0, 3 + addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 6 addi.w i0, i0, 1 xvinsgr2vr.w VI1, i0, 7 + addi.w i0, i0, -3 + xvinsgr2vr.w VI0, i0, 0 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 0 //1 + xvinsgr2vr.w VI0, i0, 1 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 1 //2 - addi.w i0, i0, 3 - xvinsgr2vr.w VI0, i0, 2 //5 + xvinsgr2vr.w VI0, i0, 2 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 3 //6 - addi.w i0, i0, -3 - xvinsgr2vr.w VI0, i0, 4 //3 + xvinsgr2vr.w VI0, i0, 3 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 4 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 5 //4 - addi.w i0, i0, 3 - xvinsgr2vr.w VI0, i0, 6 //7 + xvinsgr2vr.w VI0, i0, 5 addi.w i0, i0, 1 - xvinsgr2vr.w VI0, i0, 7 //8 + xvinsgr2vr.w VI0, i0, 6 + addi.w i0, i0, 1 + xvinsgr2vr.w VI0, i0, 7 #endif .align 3 @@ -301,16 +301,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.d x1, t3, 1 xvinsgr2vr.d x2, t4, 1 xvadd.d VI1, VI1, VINC4 + xvfmul.d x3, VI4, x1 + xvfmul.d x4, VI4, x2 + xvfcmp.clt.d VT0, x1, VI3 + xvfcmp.clt.d VINC8, x2, VI3 + xvbitsel.v x1, x1, x3, VT0 + xvbitsel.v x2, x2, x4, VINC8 + xvfadd.d x1, x1, x2 + xvfmax.d x3, VM0, x1 ld.d t1, X, 0 * SIZE + xvfcmp.ceq.d VT0, x3, VM0 ld.d t2, X, 1 * SIZE + xvbitsel.v VM0, x3, VM0, VT0 + xvbitsel.v VI0, VI1, VI0, VT0 add.d X, X, INCX ld.d t3, X, 0 * SIZE ld.d t4, X, 1 * SIZE add.d X, X, INCX - xvinsgr2vr.d x1, t1, 2 - xvinsgr2vr.d x2, t2, 2 - xvinsgr2vr.d x1, t3, 3 - xvinsgr2vr.d x2, t4, 3 + xvinsgr2vr.d x1, t1, 0 + xvinsgr2vr.d x2, t2, 0 + xvinsgr2vr.d x1, t3, 1 + xvinsgr2vr.d x2, t4, 1 + xvadd.d VI1, VI1, VINC4 addi.d I, I, -1 xvfmul.d x3, VI4, x1 xvfmul.d x4, VI4, x2 @@ -332,6 +344,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w x2, t2, 0 xvinsgr2vr.w x1, t3, 1 xvinsgr2vr.w x2, t4, 1 + xvadd.w VI1, VI1, VINC4 ld.w t1, X, 0 * SIZE ld.w t2, X, 1 * SIZE add.d X, X, INCX @@ -342,31 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w x2, t2, 2 xvinsgr2vr.w x1, t3, 3 xvinsgr2vr.w x2, t4, 3 - xvadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 4 - xvinsgr2vr.w x2, t2, 4 - xvinsgr2vr.w x1, t3, 5 - xvinsgr2vr.w x2, t4, 5 - xvadd.w VI1, VI1, VINC8 - ld.w t1, X, 0 * SIZE - ld.w t2, X, 1 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - ld.w t4, X, 1 * SIZE - add.d X, X, INCX - xvinsgr2vr.w x1, t1, 6 - xvinsgr2vr.w x2, t2, 6 - xvinsgr2vr.w x1, t3, 7 - xvinsgr2vr.w x2, t4, 7 addi.d I, I, -1 - xvpickev.w x1, VX1, VX0 - xvpickod.w x2, VX1, VX0 xvfmul.s x3, VI4, x1 xvfmul.s x4, VI4, x2 xvfcmp.clt.s VT0, x1, VI3 @@ -384,152 +373,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L25: #ifdef DOUBLE - xvpickve.d VI1, VI0, 0 - xvpickve.d VI2, VI0, 1 - xvpickve.d VI3, VI0, 2 - xvpickve.d VI4, VI0, 3 - xvpickve.d x1, VM0, 0 - xvpickve.d x2, VM0, 1 - xvpickve.d x3, VM0, 2 - xvpickve.d x4, VM0, 3 - xvfmaxa.d VM1, x1, x2 - xvfcmp.ceq.d VT0, VM1, x1 + vreplvei.d $vr21, $vr20, 0 + vreplvei.d $vr22, $vr20, 1 + vreplvei.d $vr9, $vr15, 0 + vreplvei.d $vr10, $vr15, 1 + fcmp.ceq.d $fcc0, $f10, $f9 + bceqz $fcc0, .L26 + xvfcmp.clt.d VT0, VI1, VI2 + xvbitsel.v VI0, VI2, VI1, VT0 + b .L27 +#else + vreplvei.w $vr21, $vr20, 0 + vreplvei.w $vr22, $vr20, 1 + vreplvei.w $vr8, $vr20, 2 + vreplvei.w $vr19, $vr20, 3 + vreplvei.w $vr9, $vr15, 0 + vreplvei.w $vr10, $vr15, 1 + vreplvei.w $vr11, $vr15, 2 + vreplvei.w $vr12, $vr15, 3 + xvfmaxa.s VM1, x1, x2 + xvfcmp.ceq.s VT0, VM1, x1 xvbitsel.v VINC4, VI2, VI1, VT0 - xvfmaxa.d VM0, x3, x4 - xvfcmp.ceq.d VT0, x3, VM0 + xvfmaxa.s VM0, x3, x4 + xvfcmp.ceq.s VT0, x3, VM0 xvbitsel.v VINC8, VI4, VI3, VT0 - xvfmaxa.d VM0, VM0, VM1 - xvfcmp.ceq.d VT0, VM0, VM1 + xvfmaxa.s VM0, VM0, VM1 + xvfcmp.ceq.s VT0, VM0, VM1 xvbitsel.v VI0, VINC8, VINC4, VT0 -#else - xvxor.v VX0, VX0, VX0 - xvor.v VX0, VI0, VX0 - xvxor.v VX1, VX1, VX1 - xvor.v VX1, VM0, VX1 - xvpickve.w VI1, VI0, 0 - xvpickve.w VI2, VI0, 1 - xvpickve.w VI3, VI0, 2 - xvpickve.w VI4, VI0, 3 - xvpickve.w x1, VM0, 0 - xvpickve.w x2, VM0, 1 - xvpickve.w x3, VM0, 2 - xvpickve.w x4, VM0, 3 - xvfcmp.clt.s VT0, x1, x2 - xvbitsel.v VM1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x3, x4 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM0, VM1 - xvbitsel.v VM0, VM0, VM1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 -#endif fcmp.ceq.d $fcc0, $f15, $f9 bceqz $fcc0, .L26 - XVCMPLT VT0, VI1, VI0 + xvfcmp.clt.s VT0, VI1, VI0 xvbitsel.v VI0, VI0, VI1, VT0 +#endif .align 3 +#ifdef DOUBLE .L26: - fcmp.ceq.d $fcc0, $f15, $f10 - bceqz $fcc0, .L27 - XVCMPLT VT0, VI2, VI0 - xvbitsel.v VI0, VI0, VI2, VT0 + xvfmaxa.d VM0, x1, x2 + xvfcmp.ceq.d VT0, x1, VM0 + xvbitsel.v VI0, VI2, VI1, VT0 .align 3 .L27: - fcmp.ceq.d $fcc0, $f15, $f11 - bceqz $fcc0, .L28 - XVCMPLT VT0, VI3, VI0 - xvbitsel.v VI0, VI0, VI3, VT0 - .align 3 - -.L28: - fcmp.ceq.d $fcc0, $f15, $f12 - bceqz $fcc0, .L29 - XVCMPLT VT0, VI4, VI0 - xvbitsel.v VI0, VI0, VI4, VT0 - .align 3 - -.L29: -#ifdef DOUBLE movfr2gr.d i0, $f20 -#else - fmov.s $f16, $f20 -#endif .align 3 - -#ifdef DOUBLE #else -.L252: - xvxor.v VI0, VI0, VI0 - xvor.v VI0, VI0, VX0 - fmov.s $f13, $f15 - xvxor.v VM0, VM0, VM0 - xvor.v VM0, VM0, VX1 - xvpickve.w VI1, VI0, 4 - xvpickve.w VI2, VI0, 5 - xvpickve.w VI3, VI0, 6 - xvpickve.w VI4, VI0, 7 - xvpickve.w x1, VM0, 4 - xvpickve.w x2, VM0, 5 - xvpickve.w x3, VM0, 6 - xvpickve.w x4, VM0, 7 - xvfcmp.clt.s VT0, x1, x2 - xvbitsel.v x1, x1, x2, VT0 - xvbitsel.v VINC4, VI1, VI2, VT0 - xvfcmp.clt.s VT0, x3, x4 - xvbitsel.v VM0, x3, x4, VT0 - xvbitsel.v VINC8, VI3, VI4, VT0 - xvfcmp.clt.s VT0, VM0, x1 - xvbitsel.v VM0, VM0, x1, VT0 - xvbitsel.v VI0, VINC8, VINC4, VT0 - fcmp.ceq.d $fcc0, $f15, $f9 - bceqz $fcc0, .L262 - xvfcmp.clt.s VT0, VI1, VI0 - xvbitsel.v VI0, VI0, VI1, VT0 - .align 3 - -.L262: +.L26: fcmp.ceq.d $fcc0, $f15, $f10 - bceqz $fcc0, .L272 + bceqz $fcc0, .L27 xvfcmp.clt.s VT0, VI2, VI0 xvbitsel.v VI0, VI0, VI2, VT0 .align 3 -.L272: +.L27: fcmp.ceq.d $fcc0, $f15, $f11 - bceqz $fcc0, .L282 + bceqz $fcc0, .L28 xvfcmp.clt.s VT0, VI3, VI0 xvbitsel.v VI0, VI0, VI3, VT0 .align 3 -.L282: +.L28: fcmp.ceq.d $fcc0, $f15, $f12 - bceqz $fcc0, .L292 + bceqz $fcc0, .L29 xvfcmp.clt.s VT0, VI4, VI0 xvbitsel.v VI0, VI0, VI4, VT0 .align 3 -.L292: - fcmp.clt.s $fcc0, $f15, $f13 - fsel $f15, $f15, $f13, $fcc0 - fsel $f20, $f20, $f16, $fcc0 +.L29: movfr2gr.s i0, $f20 + .align 3 #endif -.L21: //N<8 -#ifdef DOUBLE +.L21: //N<4 andi I, N, 3 bge $r0, I, .L999 srai.d i1, N, 2 slli.d i1, i1, 2 -#else - andi I, N, 7 - bge $r0, I, .L999 - srai.d i1, N, 3 - slli.d i1, i1, 3 -#endif addi.d i1, i1, 1 //current index movgr2fr.d $f21, i1 movgr2fr.d $f20, i0 @@ -550,10 +469,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d i1, i1, 1 movgr2fr.d $f21, i1 blt $r0, I, .L22 - MTG i0, $f20 + MTG i0, $f20 .align 3 - .L999: move $r4, $r17 jirl $r0, $r1, 0x0 From 57bb46bedfca77fdbce2a480cb3949ca5ea9ab91 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:42:22 +0800 Subject: [PATCH 31/38] Loongarch64: fixed rot_lasx --- kernel/loongarch64/rot_lasx.S | 1300 ++++----------------------------- 1 file changed, 123 insertions(+), 1177 deletions(-) diff --git a/kernel/loongarch64/rot_lasx.S b/kernel/loongarch64/rot_lasx.S index 71378e0b2..386e9136f 100644 --- a/kernel/loongarch64/rot_lasx.S +++ b/kernel/loongarch64/rot_lasx.S @@ -64,6 +64,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. slli.d TEMP, TEMP, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT + move XX, X + move YY, Y #ifdef DOUBLE movfr2gr.d t1, C xvreplgr2vr.d VXC, t1 @@ -80,27 +82,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvreplgr2vr.w VXZ, t3 #endif srai.d I, N, 3 + bge $r0, I, .L997 bne INCX, TEMP, .L20 - bne INCY, TEMP, .L12 // INCX==1 and INCY!=1 - b .L11 // INCX==1 and INCY==1 + bne INCY, TEMP, .L121 // INCX==1 and INCY!=1 + b .L111 // INCX==1 and INCY==1 .L20: - bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1 - b .L21 // INCX!=1 and INCY==1 - -.L11: - bge $r0, I, .L997 - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L110 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L112 // C!=0 S==0 - b .L111 // C!=0 S!=0 - .align 3 - -.L110: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L114 // C==0 S==0 - b .L113 // C==0 S!=0 - .align 3 + bne INCY, TEMP, .L221 // INCX!=1 and INCY!=1 + b .L211 // INCX!=1 and INCY==1 .L111: // C!=0 S!=0 xvld VX0, X, 0 * SIZE @@ -130,90 +118,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. b .L997 .align 3 -.L112: // C!=0 S==0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvld VX3, Y, 4 * SIZE -#endif - XVMUL VT0, VX0, VXC - XVMUL VT1, VX2, VXC - xvst VT0, X, 0 * SIZE - xvst VT1, Y, 0 * SIZE -#ifdef DOUBLE - XVMUL VT0, VX1, VXC - XVMUL VT1, VX3, VXC - xvst VT0, X, 4 * SIZE - xvst VT1, Y, 4 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L112 - b .L997 - .align 3 - -.L113: // C==0 S!=0 - xvld VX0, X, 0 * SIZE - xvld VX2, Y, 0 * SIZE -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvld VX3, Y, 4 * SIZE -#endif - XVMUL VT0, VX2, VXS - XVMUL VT1, VX0, VXS - XVFSUB VT1, VXZ, VT1 - xvst VT0, X, 0 * SIZE - xvst VT1, Y, 0 * SIZE -#ifdef DOUBLE - XVMUL VT0, VX3, VXS - XVMUL VT1, VX1, VXS - xvfsub.d VT1, VXZ, VT1 - xvst VT0, X, 4 * SIZE - xvst VT1, Y, 4 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L113 - b .L997 - .align 3 - -.L114: // C==0 S==0 - xvst VXZ, X, 0 * SIZE - xvst VXZ, Y, 0 * SIZE -#ifdef DOUBLE - xvst VXZ, X, 4 * SIZE - xvst VXZ, Y, 4 * SIZE -#endif - addi.d X, X, 8 * SIZE - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L114 - b .L997 - .align 3 - -.L12: // INCX==1 and INCY!=1 - bge $r0, I, .L997 - move YY, Y - move XX, X - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L120 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L122 // C!=0 S==0 - b .L121 // C!=0 S!=0 - .align 3 - -.L120: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L124 // C==0 S==0 - b .L123 // C==0 S!=0 - .align 3 - .L121: // C!=0 S!=0 - xvld VX0, X, 0 * SIZE #ifdef DOUBLE + xvld VX0, X, 0 * SIZE ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY ld.d t2, Y, 0 * SIZE @@ -221,12 +128,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld.d t3, Y, 0 * SIZE add.d Y, Y, INCY ld.d t4, Y, 0 * SIZE + add.d Y, Y, INCY xvinsgr2vr.d VX2, t1, 0 xvinsgr2vr.d VX2, t2, 1 xvinsgr2vr.d VX2, t3, 2 xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY #else + xvld VX0, X, 0 * SIZE ld.w t1, Y, 0 * SIZE add.d Y, Y, INCY ld.w t2, Y, 0 * SIZE @@ -234,133 +142,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld.w t3, Y, 0 * SIZE add.d Y, Y, INCY ld.w t4, Y, 0 * SIZE + add.d Y, Y, INCY xvinsgr2vr.w VX2, t1, 0 xvinsgr2vr.w VX2, t2, 1 xvinsgr2vr.w VX2, t3, 2 xvinsgr2vr.w VX2, t4, 3 + ld.w t1, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t1, Y, 0 * SIZE + ld.w t2, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE + ld.w t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE xvinsgr2vr.w VX2, t1, 4 xvinsgr2vr.w VX2, t2, 5 xvinsgr2vr.w VX2, t3, 6 xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY #endif XVMUL VT0, VX0, VXC XVFMADD VT0, VX2, VXS, VT0 XVMUL VT1, VX0, VXS XVMSUB VT1, VX2, VXC, VT1 - -#ifdef DOUBLE - xvld VX1, X, 4 * SIZE - xvst VT0, X, 0 * SIZE - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - XVMUL VT0, VX1, VXC - XVFMADD VT0, VX3, VXS, VT0 - XVMUL VT1, VX1, VXS - XVMSUB VT1, VX3, VXC, VT1 - xvst VT0, X, 4 * SIZE - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 -#else xvst VT0, X, 0 * SIZE - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - -#endif - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L121 - b .L997 - .align 3 - -.L122: // C!=0 S==0 #ifdef DOUBLE - xvld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX2, t1, 0 - xvinsgr2vr.d VX2, t2, 1 - xvinsgr2vr.d VX2, t3, 2 - xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX0, VXC - xvfmul.d VT1, VX2, VXC - xvld VX1, X, 4 * SIZE - xvst VT0, X, 0 * SIZE - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX1, VXC - xvfmul.d VT1, VX3, VXC - addi.d I, I, -1 - xvst VT0, X, 4 * SIZE xvstelm.d VT1, YY, 0, 0 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 1 @@ -368,102 +173,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvstelm.d VT1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 -#else - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmul.s VT1, VX2, VXC - xvst VT0, X, 0 * SIZE - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 -#endif - add.d YY, YY, INCY - addi.d X, X, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L122 - b .L997 - .align 3 -.L123: // C==0 S!=0 -#ifdef DOUBLE - xvld VX0, X, 0 * SIZE - ld.d t1, Y, 0 * SIZE + xvld VX0, X, 4 * SIZE + ld.d t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t2, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE + ld.d t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE xvinsgr2vr.d VX2, t1, 0 xvinsgr2vr.d VX2, t2, 1 xvinsgr2vr.d VX2, t3, 2 xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX2, VXS - xvfmul.d VT1, VX0, VXS - xvfsub.d VT1, VXZ, VT1 - xvst VT0, X, 0 * SIZE - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY - xvld VX1, X, 4 * SIZE - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX3, VXS - xvfmul.d VT1, VX1, VXS - xvfsub.d VT1, VXZ, VT1 - addi.d I, I, -1 + + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 + xvst VT0, X, 4 * SIZE xvstelm.d VT1, YY, 0, 0 add.d YY, YY, INCY @@ -472,36 +202,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvstelm.d VT1, YY, 0, 2 add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 + add.d YY, YY, INCY #else - xvld VX0, X, 0 * SIZE - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX2, VXS - xvfmul.s VT1, VX0, VXS - xvfsub.s VT1, VXZ, VT1 - xvst VT0, X, 0 * SIZE xvstelm.w VT1, YY, 0, 0 add.d YY, YY, INCY xvstelm.w VT1, YY, 0, 1 @@ -517,135 +219,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvstelm.w VT1, YY, 0, 6 add.d YY, YY, INCY xvstelm.w VT1, YY, 0, 7 -#endif add.d YY, YY, INCY +#endif addi.d X, X, 8 * SIZE addi.d I, I, -1 - blt $r0, I, .L123 + blt $r0, I, .L121 b .L997 .align 3 -.L124: // C==0 S==0 - xvst VXZ, X, 0 * SIZE +.L211: // C!=0 S!=0 #ifdef DOUBLE - xvst VXZ, X, 0 * SIZE - xvst VXZ, X, 4 * SIZE - xvstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 3 + xvld VX2, Y, 0 * SIZE + ld.d t1, X, 0 * SIZE + add.d X, X, INCX + ld.d t2, X, 0 * SIZE + add.d X, X, INCX + ld.d t3, X, 0 * SIZE + add.d X, X, INCX + ld.d t4, X, 0 * SIZE + add.d X, X, INCX + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 #else - xvst VXZ, X, 0 * SIZE - xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 -#endif - add.d YY, YY, INCY - addi.d I, I, -1 - addi.d X, X, 8 * SIZE - blt $r0, I, .L124 - move Y, YY - b .L997 - .align 3 - -.L21:// INCX!=1 and INCY==1 - bge $r0, I, .L997 - move XX, X - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L210 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L212 // C!=0 S==0 - b .L211 // C!=0 S!=0 - .align 3 - -.L210: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L214 // C==0 S==0 - b .L213 // C==0 S!=0 - .align 3 - -.L211: // C!=0 S!=0 -#ifdef DOUBLE xvld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE + ld.w t1, X, 0 * SIZE add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - add.d X, X, INCX - xvfmul.d VT0, VXC, VX0 - xvfmadd.d VT0, VX2, VXS, VT0 - xvfmul.d VT1, VXS, VX0 - xvfmsub.d VT1, VX2, VXC, VT1 - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - add.d X, X, INCX - xvfmul.d VT0, VX1, VXC - xvfmadd.d VT0, VX3, VXS, VT0 - xvfmul.d VT1, VX1, VXS - xvfmsub.d VT1, VX3, VXC, VT1 - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvst VT1, Y, 4 * SIZE -#else - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 * SIZE add.d X, X, INCX ld.w t3, X, 0 * SIZE add.d X, X, INCX @@ -655,166 +256,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.w VX0, t2, 1 xvinsgr2vr.w VX0, t3, 2 xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 * SIZE add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 * SIZE add.d X, X, INCX ld.w t3, X, 0 * SIZE add.d X, X, INCX ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VT0, VXC, VX0 - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE -#endif - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L212: // C!=0 S==0 -#ifdef DOUBLE - xvld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - add.d X, X, INCX - xvfmul.d VT0, VXC, VX0 - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvfmul.d VT1, VX2, VXC - xvst VT1, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - add.d X, X, INCX - xvfmul.d VT0, VX1, VXC - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvfmul.d VT1, VX3, VXS - xvst VT1, Y, 4 * SIZE -#else - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE xvinsgr2vr.w VX0, t1, 4 xvinsgr2vr.w VX0, t2, 5 xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VT0, VXC, VX0 - xvfmul.s VT1, VX2, VXC - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE #endif - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L212 - b .L997 - .align 3 - -.L213: // C==0 S!=0 + XVMUL VT0, VXC, VX0 + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VXS, VX0 + XVMSUB VT1, VX2, VXC, VT1 #ifdef DOUBLE - xvld VX2, Y, 0 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - add.d X, X, INCX - xvfmul.d VT0, VXS, VX2 - xvfmul.d VT1, VXS, VX0 - xvfsub.d VT1, VXZ, VT1 xvstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX xvstelm.d VT0, XX, 0, 1 @@ -824,148 +283,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvstelm.d VT0, XX, 0, 3 add.d XX, XX, INCX xvst VT1, Y, 0 * SIZE - xvld VX3, Y, 4 * SIZE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - add.d X, X, INCX - xvfmul.d VT0, VX3, VXS - xvfmul.d VT1, VX1, VXS - xvfsub.d VT1, VXZ, VT1 - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvst VT1, Y, 4 * SIZE -#else - xvld VX2, Y, 0 * SIZE - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - add.d X, X, INCX - xvfmul.s VT0, VXS, VX2 - xvfmul.s VT1, VXS, VX0 - xvfsub.s VT1, VXZ, VT1 - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE -#endif - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L213 - b .L997 - .align 3 - -.L214: // C==0 S==0 -#ifdef DOUBLE - xvstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - xvstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvst VT1, Y, 4 * SIZE -#else - xvstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvst VT1, Y, 0 * SIZE - xvstelm.w VXZ, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 7 - add.d XX, XX, INCX -#endif - addi.d Y, Y, 8 * SIZE - addi.d I, I, -1 - blt $r0, I, .L211 - b .L997 - .align 3 - -.L22: - bge $r0, I, .L997 - move YY, Y - move XX, X - CMPEQ $fcc0, C, a1 - bcnez $fcc0, .L220 - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L222 // C!=0 S==0 - b .L221 // C!=0 S!=0 - .align 3 -.L220: - CMPEQ $fcc0, S, a1 - bcnez $fcc0, .L224 // C==0 S==0 - b .L223 // C==0 S!=0 - .align 3 - -.L221: // C!=0 S!=0 -#ifdef DOUBLE + xvld VX2, Y, 4 * SIZE ld.d t1, X, 0 * SIZE add.d X, X, INCX ld.d t2, X, 0 * SIZE @@ -978,135 +297,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvinsgr2vr.d VX0, t2, 1 xvinsgr2vr.d VX0, t3, 2 xvinsgr2vr.d VX0, t4, 3 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX2, t1, 0 - xvinsgr2vr.d VX2, t2, 1 - xvinsgr2vr.d VX2, t3, 2 - xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX0, VXC - xvfmadd.d VT0, VX2, VXS, VT0 - xvfmul.d VT1, VX0, VXS - xvfmsub.d VT1, VX2, VXC, VT1 - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX1, VXC - xvfmadd.d VT0, VX3, VXS, VT0 - xvfmul.d VT1, VX1, VXS - xvfmsub.d VT1, VX3, VXC, VT1 + + XVMUL VT0, VXC, VX0 + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VXS, VX0 + XVMSUB VT1, VX2, VXC, VT1 xvstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX xvstelm.d VT0, XX, 0, 1 add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY -#else - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmadd.s VT0, VX2, VXS, VT0 - xvfmul.s VT1, VX0, VXS - xvfmsub.s VT1, VX2, VXC, VT1 + xvstelm.d VT0, XX, 0, 2 + add.d XX, XX, INCX + xvstelm.d VT0, XX, 0, 3 + add.d XX, XX, INCX + xvst VT1, Y, 4 * SIZE +#else xvstelm.w VT0, XX, 0, 0 add.d XX, XX, INCX xvstelm.w VT0, XX, 0, 1 @@ -1123,232 +328,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d XX, XX, INCX xvstelm.w VT0, XX, 0, 7 add.d XX, XX, INCX - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY + xvst VT1, Y, 0 * SIZE #endif + addi.d Y, Y, 8 * SIZE addi.d I, I, -1 - blt $r0, I, .L221 + blt $r0, I, .L211 b .L997 .align 3 -.L222: // C!=0 S==0 +.L221: // C!=0 S!=0 #ifdef DOUBLE - ld.d t1, X, 0 * SIZE + ld.d t1, X, 0 * SIZE add.d X, X, INCX - ld.d t2, X, 0 * SIZE + ld.d t2, X, 0 * SIZE add.d X, X, INCX - ld.d t3, X, 0 * SIZE + ld.d t3, X, 0 * SIZE add.d X, X, INCX - ld.d t4, X, 0 * SIZE + ld.d t4, X, 0 * SIZE add.d X, X, INCX xvinsgr2vr.d VX0, t1, 0 xvinsgr2vr.d VX0, t2, 1 xvinsgr2vr.d VX0, t3, 2 xvinsgr2vr.d VX0, t4, 3 - ld.d t1, Y, 0 * SIZE + ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE + ld.d t2, Y, 0 * SIZE + add.d Y, Y, INCY + ld.d t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE + ld.d t4, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE xvinsgr2vr.d VX2, t1, 0 xvinsgr2vr.d VX2, t2, 1 xvinsgr2vr.d VX2, t3, 2 xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX0, VXC - xvfmul.d VT1, VX2, VXC - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX1, VXC - xvfmul.d VT1, VX3, VXC - xvstelm.d VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.d VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VT1, YY, 0, 3 - add.d YY, YY, INCY #else - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 * SIZE add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 * SIZE add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 * SIZE add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 * SIZE add.d X, X, INCX xvinsgr2vr.w VX0, t1, 0 xvinsgr2vr.w VX0, t2, 1 xvinsgr2vr.w VX0, t3, 2 xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE + ld.w t1, X, 0 * SIZE add.d X, X, INCX - ld.w t2, X, 0 * SIZE + ld.w t2, X, 0 * SIZE add.d X, X, INCX - ld.w t3, X, 0 * SIZE + ld.w t3, X, 0 * SIZE add.d X, X, INCX - ld.w t4, X, 0 * SIZE + ld.w t4, X, 0 * SIZE add.d X, X, INCX xvinsgr2vr.w VX0, t1, 4 xvinsgr2vr.w VX0, t2, 5 xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE + + ld.w t1, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE + ld.w t2, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE + ld.w t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE + ld.w t4, Y, 0 * SIZE add.d Y, Y, INCY xvinsgr2vr.w VX2, t1, 0 xvinsgr2vr.w VX2, t2, 1 xvinsgr2vr.w VX2, t3, 2 xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE + ld.w t1, Y, 0 * SIZE + add.d Y, Y, INCY + ld.w t2, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE + ld.w t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE + ld.w t4, Y, 0 * SIZE add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE xvinsgr2vr.w VX2, t1, 4 xvinsgr2vr.w VX2, t2, 5 xvinsgr2vr.w VX2, t3, 6 xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX0, VXC - xvfmul.s VT1, VX2, VXC - xvstelm.w VT0, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VT0, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VT1, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VT1, YY, 0, 7 - add.d YY, YY, INCY #endif - addi.d I, I, -1 - blt $r0, I, .L222 - b .L997 - .align 3 - -.L223: // C==0 S!=0 + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 #ifdef DOUBLE - ld.d t1, X, 0 * SIZE - add.d X, X, INCX - ld.d t2, X, 0 * SIZE - add.d X, X, INCX - ld.d t3, X, 0 * SIZE - add.d X, X, INCX - ld.d t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.d VX0, t1, 0 - xvinsgr2vr.d VX0, t2, 1 - xvinsgr2vr.d VX0, t3, 2 - xvinsgr2vr.d VX0, t4, 3 - ld.d t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX2, t1, 0 - xvinsgr2vr.d VX2, t2, 1 - xvinsgr2vr.d VX2, t3, 2 - xvinsgr2vr.d VX2, t4, 3 - add.d Y, Y, INCY - xvfmul.d VT0, VX2, VXS - xvfmul.d VT1, VX0, VXS - xvfsub.d VT1, VXZ, VT1 xvstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX xvstelm.d VT0, XX, 0, 1 @@ -1365,33 +434,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d YY, YY, INCY xvstelm.d VT1, YY, 0, 3 add.d YY, YY, INCY - ld.d t1, X, 0 * SIZE + + ld.d t1, X, 0 * SIZE add.d X, X, INCX - ld.d t2, X, 0 * SIZE + ld.d t2, X, 0 * SIZE add.d X, X, INCX - ld.d t3, X, 0 * SIZE + ld.d t3, X, 0 * SIZE add.d X, X, INCX - ld.d t4, X, 0 * SIZE + ld.d t4, X, 0 * SIZE add.d X, X, INCX - xvinsgr2vr.d VX1, t1, 0 - xvinsgr2vr.d VX1, t2, 1 - xvinsgr2vr.d VX1, t3, 2 - xvinsgr2vr.d VX1, t4, 3 - ld.d t1, Y, 0 * SIZE + xvinsgr2vr.d VX0, t1, 0 + xvinsgr2vr.d VX0, t2, 1 + xvinsgr2vr.d VX0, t3, 2 + xvinsgr2vr.d VX0, t4, 3 + ld.d t1, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t2, Y, 0 * SIZE + ld.d t2, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t3, Y, 0 * SIZE + ld.d t3, Y, 0 * SIZE add.d Y, Y, INCY - ld.d t4, Y, 0 * SIZE - xvinsgr2vr.d VX3, t1, 0 - xvinsgr2vr.d VX3, t2, 1 - xvinsgr2vr.d VX3, t3, 2 - xvinsgr2vr.d VX3, t4, 3 + ld.d t4, Y, 0 * SIZE add.d Y, Y, INCY - xvfmul.d VT0, VX3, VXS - xvfmul.d VT1, VX0, VXS - xvfsub.d VT1, VXZ, VT1 + xvinsgr2vr.d VX2, t1, 0 + xvinsgr2vr.d VX2, t2, 1 + xvinsgr2vr.d VX2, t3, 2 + xvinsgr2vr.d VX2, t4, 3 + + XVMUL VT0, VX0, VXC + XVFMADD VT0, VX2, VXS, VT0 + XVMUL VT1, VX0, VXS + XVMSUB VT1, VX2, VXC, VT1 xvstelm.d VT0, XX, 0, 0 add.d XX, XX, INCX xvstelm.d VT0, XX, 0, 1 @@ -1409,57 +481,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvstelm.d VT1, YY, 0, 3 add.d YY, YY, INCY #else - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 0 - xvinsgr2vr.w VX0, t2, 1 - xvinsgr2vr.w VX0, t3, 2 - xvinsgr2vr.w VX0, t4, 3 - ld.w t1, X, 0 * SIZE - add.d X, X, INCX - ld.w t2, X, 0 * SIZE - add.d X, X, INCX - ld.w t3, X, 0 * SIZE - add.d X, X, INCX - ld.w t4, X, 0 * SIZE - add.d X, X, INCX - xvinsgr2vr.w VX0, t1, 4 - xvinsgr2vr.w VX0, t2, 5 - xvinsgr2vr.w VX0, t3, 6 - xvinsgr2vr.w VX0, t4, 7 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - add.d Y, Y, INCY - xvinsgr2vr.w VX2, t1, 0 - xvinsgr2vr.w VX2, t2, 1 - xvinsgr2vr.w VX2, t3, 2 - xvinsgr2vr.w VX2, t4, 3 - ld.w t1, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t2, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t3, Y, 0 * SIZE - add.d Y, Y, INCY - ld.w t4, Y, 0 * SIZE - xvinsgr2vr.w VX2, t1, 4 - xvinsgr2vr.w VX2, t2, 5 - xvinsgr2vr.w VX2, t3, 6 - xvinsgr2vr.w VX2, t4, 7 - add.d Y, Y, INCY - xvfmul.s VT0, VX2, VXS - xvfmul.s VT1, VX0, VXS - xvfsub.s VT1, VXZ, VT1 xvstelm.w VT0, XX, 0, 0 add.d XX, XX, INCX xvstelm.w VT0, XX, 0, 1 @@ -1476,6 +497,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d XX, XX, INCX xvstelm.w VT0, XX, 0, 7 add.d XX, XX, INCX + xvstelm.w VT1, YY, 0, 0 add.d YY, YY, INCY xvstelm.w VT1, YY, 0, 1 @@ -1494,83 +516,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add.d YY, YY, INCY #endif addi.d I, I, -1 - blt $r0, I, .L223 - b .L997 - .align 3 - -.L224: // C==0 S==0 -#ifdef DOUBLE - xvstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.d VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.d VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.d VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.d VXZ, YY, 0, 3 -#else - xvstelm.w VXZ, XX, 0, 0 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 1 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 2 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 3 - add.d XX, XX, INCX - xvstelm.w VXZ, YY, 0, 0 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 1 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 2 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 3 - add.d YY, YY, INCY - xvstelm.w VXZ, XX, 0, 4 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 5 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 6 - add.d XX, XX, INCX - xvstelm.w VXZ, XX, 0, 7 - add.d XX, XX, INCX - xvstelm.w VXZ, YY, 0, 4 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 5 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 6 - add.d YY, YY, INCY - xvstelm.w VXZ, YY, 0, 7 -#endif - add.d YY, YY, INCY - addi.d I, I, -1 - blt $r0, I, .L224 -#ifdef DOUBLE - move X, XX - move Y, YY -#endif + blt $r0, I, .L221 b .L997 .align 3 From b471fa337bdc59e11df6baacfd9d1202bb4079a7 Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:42:36 +0800 Subject: [PATCH 32/38] Loongarch64: fixed snrm2_lasx --- kernel/loongarch64/snrm2_lasx.S | 126 ++++++++++++++++++++++++-------- 1 file changed, 94 insertions(+), 32 deletions(-) diff --git a/kernel/loongarch64/snrm2_lasx.S b/kernel/loongarch64/snrm2_lasx.S index 3ae11e897..e5c9c557a 100644 --- a/kernel/loongarch64/snrm2_lasx.S +++ b/kernel/loongarch64/snrm2_lasx.S @@ -43,15 +43,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define t2 $r13 #define t3 $r14 #define t4 $r15 - -/* Don't change following FR unless you know the effects. */ #define VX0 $xr15 #define VX1 $xr16 #define VX2 $xr17 #define VX3 $xr18 #define VX4 $xr21 +#define VX5 $xr22 +/* Don't change following FR unless you know the effects. */ #define res1 $xr19 #define res2 $xr20 +#define RCP $f2 +#define VALPHA $xr3 + +// The optimization for snrm2 cannot simply involve +// extending the data type from float to double and +// then summing the squares of the data. LAPACK tests +// have shown that this approach can still lead to data overflow. +// Instead, we need to find the maximum absolute value in the entire +// array and divide each data element by this maximum value before +// performing the calculation. This approach can avoid overflow (and does not require extending the data type). PROLOGUE @@ -59,29 +69,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDINT N, 0(N) LDINT INCX, 0(INCX) #endif + bge $r0, N, .L999 + beq $r0, INCX, .L999 + addi.d $sp, $sp, -32 + st.d $ra, $sp, 0 + st.d N, $sp, 8 + st.d X, $sp, 16 + st.d INCX, $sp, 24 +#ifdef DYNAMIC_ARCH + bl samax_k_LA264 +#else + bl samax_k +#endif + ld.d $ra, $sp, 0 + ld.d N, $sp, 8 + ld.d X, $sp, 16 + ld.d INCX, $sp, 24 + addi.d $sp, $sp, 32 + + frecip.s RCP, $f0 + vreplvei.w $vr3, $vr2, 0 + xvpermi.d VALPHA, $xr3,0x00 xvxor.v res1, res1, res1 xvxor.v res2, res2, res2 - bge $r0, N, .L999 - beq $r0, INCX, .L999 + fcmp.ceq.s $fcc0, $f0, $f19 + bcnez $fcc0, .L999 li.d TEMP, SIZE slli.d INCX, INCX, BASE_SHIFT - srai.d I, N, 3 + srai.d I, N, 4 bne INCX, TEMP, .L20 - bge $r0, I, .L997 + bge $r0, I, .L997 .align 3 .L10: - xvld VX0, X, 0 - xvfcvtl.d.s VX1, VX0 - xvfcvth.d.s VX2, VX0 - xvfmadd.d res1, VX1, VX1, res1 - xvfmadd.d res2, VX2, VX2, res2 + xvld VX0, X, 0 + xvld VX5, X, 8 * SIZE addi.d I, I, -1 - addi.d X, X, 8 * SIZE + addi.d X, X, 16 * SIZE + + xvfmul.s VX0, VX0, VALPHA + xvfmul.s VX5, VX5, VALPHA + + xvfmadd.s res1, VX0, VX0, res1 + xvfmadd.s res2, VX5, VX5, res2 blt $r0, I, .L10 - .align 3 b .L996 + .align 3 .L20: bge $r0, I, .L997 @@ -107,47 +141,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld.w t3, X, 0 add.d X, X, INCX ld.w t4, X, 0 + add.d X, X, INCX xvinsgr2vr.w VX0, t1, 4 xvinsgr2vr.w VX0, t2, 5 xvinsgr2vr.w VX0, t3, 6 xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VX0, VX0, VALPHA + xvfmadd.s res1, VX0, VX0, res1 + + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 add.d X, X, INCX - xvfcvtl.d.s VX1, VX0 - xvfcvth.d.s VX2, VX0 - xvfmadd.d res1, VX1, VX1, res1 - xvfmadd.d res2, VX2, VX2, res2 + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 0 + xvinsgr2vr.w VX0, t2, 1 + xvinsgr2vr.w VX0, t3, 2 + xvinsgr2vr.w VX0, t4, 3 + ld.w t1, X, 0 + add.d X, X, INCX + ld.w t2, X, 0 + add.d X, X, INCX + ld.w t3, X, 0 + add.d X, X, INCX + ld.w t4, X, 0 + add.d X, X, INCX + xvinsgr2vr.w VX0, t1, 4 + xvinsgr2vr.w VX0, t2, 5 + xvinsgr2vr.w VX0, t3, 6 + xvinsgr2vr.w VX0, t4, 7 + xvfmul.s VX0, VX0, VALPHA + xvfmadd.s res2, VX0, VX0, res2 addi.d I, I, -1 blt $r0, I, .L21 - b .L996 + .align 3 .L996: - xvfadd.d res1, res1, res2 - xvpickve.d VX1, res1, 1 - xvpickve.d VX2, res1, 2 - xvpickve.d VX3, res1, 3 - fadd.d $f19, $f19, $f16 - fadd.d $f19, $f19, $f17 - fadd.d $f19, $f19, $f18 + xvfadd.s res1, res1, res2 + xvpermi.d VX1, res1, 0x4e + xvfadd.s res1, res1, VX1 + vreplvei.w $vr16, $vr19, 1 + vreplvei.w $vr17, $vr19, 2 + vreplvei.w $vr18, $vr19, 3 + xvfadd.s res1, VX1, res1 + xvfadd.s res1, VX2, res1 + xvfadd.s res1, VX3, res1 .align 3 .L997: - andi I, N, 7 + andi I, N, 15 bge $r0, I, .L999 .align 3 .L998: fld.s $f15, X, 0 - add.d X, X, INCX - addi.d I, I, -1 - fcvt.d.s $f15, $f15 - fmadd.d $f19, $f15, $f15, $f19 + addi.d I, I, -1 + fmul.s $f15, $f15, RCP + fmadd.s $f19, $f15, $f15, $f19 + add.d X, X, INCX blt $r0, I, .L998 .align 3 .L999: - fsqrt.d $f19, $f19 + fsqrt.s $f19, $f19 + fmul.s $f0, $f19, $f0 move $r4, $r17 - fcvt.s.d $f0, $f19 jirl $r0, $r1, 0x0 + .align 3 EPILOGUE From f19e72c40293f617ab290fad2e7fc6865cc772cb Mon Sep 17 00:00:00 2001 From: pengxu Date: Wed, 30 Apr 2025 16:42:52 +0800 Subject: [PATCH 33/38] Loongarch64: fixed swap_lasx --- kernel/loongarch64/swap_lasx.S | 65 +++++----------------------------- 1 file changed, 9 insertions(+), 56 deletions(-) diff --git a/kernel/loongarch64/swap_lasx.S b/kernel/loongarch64/swap_lasx.S index 4767fffe3..3e7a14c42 100644 --- a/kernel/loongarch64/swap_lasx.S +++ b/kernel/loongarch64/swap_lasx.S @@ -318,62 +318,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. move XX, X .L222: - LD a1, X, 0 - add.d X, X, INCX - LD a2, X, 0 - add.d X, X, INCX - LD a3, X, 0 - add.d X, X, INCX - LD a4, X, 0 - add.d X, X, INCX - LD b1, Y, 0 - ST a1, Y, 0 - add.d Y, Y, INCY - LD b2, Y, 0 - ST a2, Y, 0 - add.d Y, Y, INCY - LD b3, Y, 0 - ST a3, Y, 0 - add.d Y, Y, INCY - LD b4, Y, 0 - ST a4, Y, 0 - add.d Y, Y, INCY - LD a1, X, 0 - add.d X, X, INCX - ST b1, XX, 0 - add.d XX, XX, INCX - LD b1, Y, 0 - ST a1, Y, 0 - add.d Y, Y, INCY - LD a2, X, 0 - add.d X, X, INCX - ST b2, XX, 0 - add.d XX, XX, INCX - LD b2, Y, 0 - ST a2, Y, 0 - add.d Y, Y, INCY - LD a3, X, 0 - add.d X, X, INCX - ST b3, XX, 0 - add.d XX, XX, INCX - LD b3, Y, 0 - ST a3, Y, 0 - LD a4, X, 0 - add.d X, X, INCX - ST b4, XX, 0 - add.d XX, XX, INCX - LD b4, Y, 0 - ST a4, Y, 0 - add.d Y, Y, INCY - ST b1, XX, 0 - add.d XX, XX, INCX - ST b2, XX, 0 - add.d XX, XX, INCX - ST b3, XX, 0 - add.d XX, XX, INCX - ST b4, XX, 0 - add.d XX, XX, INCX - addi.d I, I, -1 +.rept 8 + LD $f12, X, 0 + LD $f14, Y, 0 + ST $f12, Y, 0 + ST $f14, X, 0 + add.d X, X, INCX + add.d Y, Y, INCY +.endr + addi.d I, I, -1 blt $r0, I, .L222 .align 3 From 4bee135cc1362c168d419e3b021e4a97f5883816 Mon Sep 17 00:00:00 2001 From: Scott Tsai Date: Wed, 30 Apr 2025 20:53:11 +0800 Subject: [PATCH 34/38] cpuid_x86: improve Intel Arrow Lake detection Add Intel Arrow Lake CPUIDs. See the datasheet: https://edc.intel.com/content/www/us/en/design/products/platforms/details/arrow-lake-s/core-ultra-200s-series-processors-datasheet-volume-1-of-2/cpuid/ --- cpuid_x86.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 4e13f1462..1b09c7217 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1578,6 +1578,7 @@ int get_cpuname(void){ case 12: //family 6 exmodel 12 switch (model) { case 15: + case 6: // Arrow Lake if(support_avx512()) return CPUTYPE_SAPPHIRERAPIDS; if(support_avx2()) @@ -2421,6 +2422,22 @@ int get_coretype(void){ else return CORE_NEHALEM; } + case 12: + switch (model) { + case 6: // Arrow Lake + if(support_amx_bf16()) + return CORE_SAPPHIRERAPIDS; + if(support_avx512_bf16()) + return CORE_COOPERLAKE; + if(support_avx512()) + return CORE_SKYLAKEX; + if(support_avx2()) + return CORE_HASWELL; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } } case 15: if (model <= 0x2) return CORE_NORTHWOOD; From 47b43054f18ea7bf36eb5d901f1533f2e0cb4cc3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 May 2025 11:52:22 +0200 Subject: [PATCH 35/38] Avoid out of bounds accesses in SCAL when INFO<0 --- lapack-netlib/SRC/dgeev.f | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/SRC/dgeev.f b/lapack-netlib/SRC/dgeev.f index 4677b9f52..fc73bb226 100644 --- a/lapack-netlib/SRC/dgeev.f +++ b/lapack-netlib/SRC/dgeev.f @@ -506,17 +506,17 @@ * Undo scaling if necessary * 50 CONTINUE - IF( SCALEA ) THEN + IF( SCALEA .AND. INFO.GT.0) THEN CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WR( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WI( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) - IF( INFO.GT.0 ) THEN + CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WR, N, $ IERR ) CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WI, N, $ IERR ) - END IF + END IF * WORK( 1 ) = MAXWRK From d48a2fc4692600826622bda8e36d865cc17e3f50 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 May 2025 11:53:50 +0200 Subject: [PATCH 36/38] Avoid out of bounds accesses in SCAL when INFO<0 --- lapack-netlib/SRC/cgeev.f | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/SRC/cgeev.f b/lapack-netlib/SRC/cgeev.f index bb41599d1..af14aa73a 100644 --- a/lapack-netlib/SRC/cgeev.f +++ b/lapack-netlib/SRC/cgeev.f @@ -485,12 +485,12 @@ * Undo scaling if necessary * 50 CONTINUE - IF( SCALEA ) THEN + IF( SCALEA .AND. INFO.GT.0 ) THEN CALL CLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, W( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) - IF( INFO.GT.0 ) THEN + CALL CLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, W, N, IERR ) - END IF + END IF * WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) From 4c0445aed1eafde035add619fac1d92d64b004db Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 May 2025 11:56:07 +0200 Subject: [PATCH 37/38] Avoid out of bounds accesses in SCAL when INFO <0 --- lapack-netlib/SRC/sgeev.f | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/SRC/sgeev.f b/lapack-netlib/SRC/sgeev.f index 93f993265..adf1a1a9d 100644 --- a/lapack-netlib/SRC/sgeev.f +++ b/lapack-netlib/SRC/sgeev.f @@ -504,17 +504,17 @@ * Undo scaling if necessary * 50 CONTINUE - IF( SCALEA ) THEN + IF( SCALEA .AND. INFO.GT.0) THEN CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WR( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WI( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) - IF( INFO.GT.0 ) THEN + CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WR, N, $ IERR ) CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WI, N, $ IERR ) - END IF + END IF * WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) From 5c958dfe1eba7a2f5e6802a39199ab3c65c1a52f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 May 2025 11:58:21 +0200 Subject: [PATCH 38/38] Avoid of out of bounds accesses in SCAL when INFO<0 --- lapack-netlib/SRC/zgeev.f | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/SRC/zgeev.f b/lapack-netlib/SRC/zgeev.f index b968900e2..6cf5c669c 100644 --- a/lapack-netlib/SRC/zgeev.f +++ b/lapack-netlib/SRC/zgeev.f @@ -485,12 +485,12 @@ * Undo scaling if necessary * 50 CONTINUE - IF( SCALEA ) THEN + IF( SCALEA .AND. INFO.GT.0) THEN CALL ZLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, W( INFO+1 ), $ MAX( N-INFO, 1 ), IERR ) - IF( INFO.GT.0 ) THEN + CALL ZLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, W, N, IERR ) - END IF + END IF * WORK( 1 ) = MAXWRK