diff --git a/kernel/loongarch64/cnrm2_lsx.S b/kernel/loongarch64/cnrm2_lsx.S
index 20950ba17..17361d143 100644
--- a/kernel/loongarch64/cnrm2_lsx.S
+++ b/kernel/loongarch64/cnrm2_lsx.S
@@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VX4 $vr21
 #define res1 $vr19
 #define res2 $vr20
+#define RCP $f2
+#define VALPHA $vr3
 
     PROLOGUE
 
@@ -55,10 +57,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     LDINT INCX, 0(INCX)
 #endif
 
-    vxor.v res1, res1, res1
-    vxor.v res2, res2, res2
     bge $r0, N, .L999
     beq $r0, INCX, .L999
+    addi.d $sp, $sp, -32
+    st.d $ra, $sp, 0
+    st.d N, $sp, 8
+    st.d X, $sp, 16
+    st.d INCX, $sp, 24
+#ifdef DYNAMIC_ARCH
+    bl camax_k_LA264
+#else
+    bl camax_k
+#endif
+    ld.d $ra, $sp, 0
+    ld.d N, $sp, 8
+    ld.d X, $sp, 16
+    ld.d INCX, $sp, 24
+    addi.d $sp, $sp, 32
+
+    frecip.s RCP, $f0
+    vreplvei.w VALPHA, $vr2, 0
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    fcmp.ceq.s $fcc0, $f0, $f19
+    bcnez $fcc0, .L999
     li.d TEMP, 1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
@@ -69,16 +91,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .L10:
     vld VX0, X, 0 * SIZE
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
-    vld VX0, X, 4 * SIZE
-    vfcvtl.d.s VX3, VX0
-    vfcvth.d.s VX4, VX0
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
     addi.d I, I, -1
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX0, VX0, VALPHA
+    vfmul.s VX1, VX1, VALPHA
+
+    vfmadd.s res1, VX0, VX0, res1
+    vfmadd.s res2, VX1, VX1, res2
+
     addi.d X, X, 8 * SIZE
     blt $r0, I, .L10
     b .L996
@@ -99,10 +120,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
     add.d X, X, INCX
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res1, VX0, VX0, res1
+
     ld.w t1, X, 0 * SIZE
     ld.w t2, X, 1 * SIZE
     add.d X, X, INCX
@@ -113,19 +133,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
     add.d X, X, INCX
-    vfcvtl.d.s VX3, VX0
-    vfcvth.d.s VX4, VX0
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res2, VX0, VX0, res2
+
     addi.d I, I, -1
     blt $r0, I, .L21
     b .L996
     .align 3
 
 .L996:
-    vfadd.d res1, res1, res2
-    vreplvei.d VX1, res1, 1
-    vfadd.d res1, VX1, res1
+    vfadd.s res1, res1, res2
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
     .align 3
 
 .L997:
@@ -137,18 +160,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     fld.s a1, X, 0 * SIZE
     fld.s a2, X, 1 * SIZE
     addi.d I, I, -1
-    fcvt.d.s a1, a1
-    fcvt.d.s a2, a2
-    fmadd.d res, a1, a1, res
-    fmadd.d res, a2, a2, res
+    fmul.s a1, a1, RCP
+    fmul.s a2, a2, RCP
+    fmadd.s res, a1, a1, res
+    fmadd.s res, a2, a2, res
     add.d X, X, INCX
     blt $r0, I, .L998
     .align 3
 
 .L999:
-    fsqrt.d res, res
+    fsqrt.s res, res
+    fmul.s $f0, res, $f0
     move $r4, $r17
-    fcvt.s.d $f0, $f19
     jirl $r0, $r1, 0x0
     .align 3
 
diff --git a/kernel/loongarch64/snrm2_lsx.S b/kernel/loongarch64/snrm2_lsx.S
index bb492dbf0..9822cc66a 100644
--- a/kernel/loongarch64/snrm2_lsx.S
+++ b/kernel/loongarch64/snrm2_lsx.S
@@ -52,6 +52,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* Don't change following FR unless you know the effects. */
 #define res1 $vr19
 #define res2 $vr20
+#define RCP $f2
+#define VALPHA $vr3
+
+// The optimization for snrm2 cannot simply involve extending
+// the data type from float to double and then summing the
+// squares of the data: LAPACK tests have shown that this
+// approach can still overflow. Instead, we find the maximum
+// absolute value of the whole array and divide every element
+// by it before squaring and summing. This avoids overflow and
+// does not require extending the data type.
 
     PROLOGUE
 
@@ -59,10 +69,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     LDINT N, 0(N)
     LDINT INCX, 0(INCX)
 #endif
-    vxor.v res1, res1, res1
-    vxor.v res2, res2, res2
     bge $r0, N, .L999
     beq $r0, INCX, .L999
+
+    addi.d $sp, $sp, -32
+    st.d $ra, $sp, 0
+    st.d N, $sp, 8
+    st.d X, $sp, 16
+    st.d INCX, $sp, 24
+#ifdef DYNAMIC_ARCH
+    bl samax_k_LA264
+#else
+    bl samax_k
+#endif
+    ld.d $ra, $sp, 0
+    ld.d N, $sp, 8
+    ld.d X, $sp, 16
+    ld.d INCX, $sp, 24
+    addi.d $sp, $sp, 32
+
+    frecip.s RCP, $f0
+    vreplvei.w VALPHA, $vr2, 0
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    fcmp.ceq.s $fcc0, $f0, $f19
+    bcnez $fcc0, .L999
     li.d TEMP, SIZE
     slli.d INCX, INCX, BASE_SHIFT
     srai.d I, N, 3
@@ -75,14 +106,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vld VX5, X, 4 * SIZE
     addi.d I, I, -1
     addi.d X, X, 8 * SIZE
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfcvtl.d.s VX3, VX5
-    vfcvth.d.s VX4, VX5
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
+
+    vfmul.s VX0, VX0, VALPHA
+    vfmul.s VX5, VX5, VALPHA
+
+    vfmadd.s res1, VX0, VX0, res1
+    vfmadd.s res2, VX5, VX5, res2
     blt $r0, I, .L10
     b .L996
     .align 3
@@ -104,10 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t2, 1
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res1, VX0, VX0, res1
+
     ld.w t1, X, 0
     add.d X, X, INCX
     ld.w t2, X, 0
@@ -120,19 +148,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t2, 1
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
-    vfcvtl.d.s VX3, VX0
-    vfcvth.d.s VX4, VX0
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res2, VX0, VX0, res2
     addi.d I, I, -1
     blt $r0, I, .L21
-    b .L996
     .align 3
 
 .L996:
-    vfadd.d res1, res1, res2
-    vreplvei.d VX1, res1, 1
-    vfadd.d res1, VX1, res1
+    vfadd.s res1, res1, res2
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
     .align 3
 
 .L997:
@@ -143,16 +172,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L998:
     fld.s $f15, X, 0
     addi.d I, I, -1
-    fcvt.d.s $f15, $f15
-    fmadd.d $f19, $f15, $f15, $f19
+    fmul.s $f15, $f15, RCP
+    fmadd.s $f19, $f15, $f15, $f19
     add.d X, X, INCX
     blt $r0, I, .L998
     .align 3
 
 .L999:
-    fsqrt.d $f19, $f19
+    fsqrt.s $f19, $f19
+    fmul.s $f0, $f19, $f0
     move $r4, $r17
-    fcvt.s.d $f0, $f19
     jirl $r0, $r1, 0x0
     .align 3
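
For reference, the scaling scheme both kernels now implement can be written out in plain C. The following is only an illustrative sketch (the function name, the argument types and the positive-stride loop are assumptions made for this example); the real kernels obtain the scale from the vectorized samax_k/camax_k routines, keep two partial sums in res1/res2, and do the scaled accumulation with LSX fused multiply-adds.

/*
 * Sketch of the scaled sum-of-squares used above:
 *   scale = max_i |x_i|;  result = scale * sqrt( sum_i (x_i/scale)^2 )
 */
#include <math.h>
#include <stddef.h>

static float snrm2_scaled_ref(size_t n, const float *x, size_t incx)
{
    if (n == 0 || incx == 0) return 0.0f;

    /* Pass 1: largest absolute value (the job of samax_k). */
    float amax = 0.0f;
    for (size_t i = 0; i < n; i++) {
        float a = fabsf(x[i * incx]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0f) return 0.0f;   /* all zeros: mirrors the early branch to .L999 */

    /* Pass 2: sum of squares of the scaled elements (RCP/VALPHA hold 1/amax). */
    float rcp = 1.0f / amax;
    float ssq = 0.0f;
    for (size_t i = 0; i < n; i++) {
        float s = x[i * incx] * rcp;
        ssq += s * s;
    }

    /* Undo the scaling: ||x||_2 = amax * sqrt(ssq). */
    return amax * sqrtf(ssq);
}

The final fsqrt.s/fmul.s pair in .L999 corresponds to the amax * sqrtf(ssq) step here: the square root is taken on the scaled sum and the maximum absolute value is multiplied back in at the end.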