LoongArch64: Update symv lsx version (tags/v0.3.30)
| @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /* Param */ | |||
| #define M $r4 | |||
| @@ -57,6 +58,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define T2 $r28 | |||
| #define T3 $r29 | |||
| #define T4 $r30 | |||
| #define T5 $r17 | |||
| #define T6 $r16 | |||
| #define T7 $r12 | |||
| /* LSX vectors */ | |||
| #define U0 $vr31 | |||
| @@ -87,10 +91,114 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define a8 $f8 | |||
| #define a9 $f9 | |||
| .macro LOAD_Y_8 /* Load 8 doubles of Y, beginning one INCY past IY, so that U4/U6/U8/U10 each hold two of them. */ | |||
| beqz T5, .L01_Y_0 /* T5 is set to INCY-1 before INCY is scaled to bytes, so zero selects the contiguous path */ | |||
| add.d T2, IY, INCY /* strided path: T2 is the running byte offset, advanced by INCY per element */ | |||
| fldx.d $f4, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f5, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f6, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f7, Y, T2 | |||
| PROLOGUE /* NOTE(review): looks like a stray row interleaved by the diff rendering rather than part of this macro - confirm */ | |||
| add.d T2, T2, INCY | |||
| fldx.d $f8, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f9, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f10, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f11, Y, T2 | |||
| vextrins.d U4, U5, 0x10 /* 0x10: element 0 of the second operand goes into element 1 of the first; presumably U4..U11 overlap $f4..$f11 (defines not shown) - TODO confirm */ | |||
| vextrins.d U6, U7, 0x10 | |||
| vextrins.d U8, U9, 0x10 | |||
| vextrins.d U10, U11, 0x10 | |||
| b .L01_Y_1 | |||
| .L01_Y_0: /* contiguous path: four 128-bit loads, two doubles each */ | |||
| add.d T7, IY, INCY /* T7 = offset of the first element */ | |||
| vldx U4, Y, T7 | |||
| alsl.d T2, INCY, T7, 1 /* T2 = T7 + 2*INCY */ | |||
| vldx U6, Y, T2 | |||
| alsl.d T3, INCY, T2, 1 /* T3 = T2 + 2*INCY */ | |||
| vldx U8, Y, T3 | |||
| alsl.d T4, INCY, T3, 1 /* T4 = T3 + 2*INCY; T7/T2/T3/T4 are read again by the contiguous path of STORE_Y_8 */ | |||
| vldx U10, Y, T4 | |||
| .L01_Y_1: | |||
| .endm | |||
| .macro LOAD_X_8 /* Load 8 doubles of X, beginning one INCX past IX, so that U4/U6/U8/U10 each hold two of them. */ | |||
| beqz T6, .L01_X_0 /* T6 is set to INCX-1 before INCX is scaled to bytes, so zero selects the contiguous path */ | |||
| add.d T2, IX, INCX /* strided path: T2 is the running byte offset, advanced by INCX per element */ | |||
| fldx.d $f4, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f5, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f6, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f7, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f8, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f9, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f10, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f11, X, T2 | |||
| vextrins.d U4, U5, 0x10 /* 0x10: element 0 of the second operand goes into element 1 of the first; presumably U4..U11 overlap $f4..$f11 (defines not shown) - TODO confirm */ | |||
| vextrins.d U6, U7, 0x10 | |||
| vextrins.d U8, U9, 0x10 | |||
| vextrins.d U10, U11, 0x10 | |||
| b .L01_X_1 | |||
| .L01_X_0: /* contiguous path: four 128-bit loads, two doubles each */ | |||
| add.d T7, IX, INCX /* T7 = offset of the first element */ | |||
| vldx U4, X, T7 | |||
| alsl.d T2, INCX, T7, 1 /* T2 = T7 + 2*INCX */ | |||
| vldx U6, X, T2 | |||
| alsl.d T3, INCX, T2, 1 /* T3 = T2 + 2*INCX */ | |||
| vldx U8, X, T3 | |||
| alsl.d T4, INCX, T3, 1 /* T4 = T3 + 2*INCX */ | |||
| vldx U10, X, T4 | |||
| .L01_X_1: | |||
| .endm | |||
| .macro STORE_Y_8 /* Store the 8 doubles held in U4/U6/U8/U10 back to Y, beginning one INCY past IY. */ | |||
| beqz T5, .L01_Y_2 /* T5 is set to INCY-1 before INCY is scaled to bytes, so zero selects the contiguous path */ | |||
| vextrins.d U5, U4, 0x01 /* 0x01: element 1 of the second operand goes into element 0 of the first, so each upper double can be stored as a scalar */ | |||
| vextrins.d U7, U6, 0x01 | |||
| vextrins.d U9, U8, 0x01 | |||
| vextrins.d U11, U10, 0x01 | |||
| add.d T2, IY, INCY /* strided path: T2 is the running byte offset, advanced by INCY per element */ | |||
| fstx.d $f4, Y, T2 /* presumably $f4..$f11 overlap U4..U11 (defines not shown) - TODO confirm */ | |||
| add.d T2, T2, INCY | |||
| fstx.d $f5, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f6, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f7, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f8, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f9, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f10, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f11, Y, T2 | |||
| b .L01_Y_3 | |||
| .L01_Y_2: /* contiguous path: T7/T2/T3/T4 are not computed here; they must still hold the offsets left by the contiguous path of LOAD_Y_8 */ | |||
| vstx U4, Y, T7 | |||
| vstx U6, Y, T2 | |||
| vstx U8, Y, T3 | |||
| vstx U10, Y, T4 | |||
| .L01_Y_3: | |||
| .endm | |||
| LDARG BUFFER, $sp, 0 | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -88 | |||
| @@ -107,6 +215,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vldrepl.d VALPHA, $sp, 80 | |||
| addi.d T5, INCY, -1 | |||
| addi.d T6, INCX, -1 | |||
| slli.d LDA, LDA, BASE_SHIFT | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| slli.d INCY, INCY, BASE_SHIFT | |||
| @@ -122,11 +232,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| beq J, N, .L999 | |||
| .L01: | |||
| MTC a2, $r0 //temp2 | |||
| vxor.v U2, U2, U2 | |||
| fldx.d a6, X, JX | |||
| fmul.d a3, ALPHA, a6 //temp1 | |||
| vshuf4i.d U3, U3, 0x00 | |||
| vshuf4i.d U2, U2, 0x00 | |||
| mul.d T0, J, LDA | |||
| slli.d T1, J, BASE_SHIFT | |||
| @@ -163,105 +272,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vldx U16, AO1, T1 | |||
| addi.d T1, T1, 16 | |||
| add.d T2, IY, INCY | |||
| fldx.d $f4, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f5, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f6, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f7, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f8, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f9, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f10, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f11, Y, T2 | |||
| vextrins.d U4, U5, 0x10 | |||
| vextrins.d U6, U7, 0x10 | |||
| vextrins.d U8, U9, 0x10 | |||
| vextrins.d U10, U11, 0x10 | |||
| LOAD_Y_8 | |||
| vfmadd.d U4, U3, U1, U4 | |||
| vfmadd.d U6, U3, U14, U6 | |||
| vfmadd.d U8, U3, U15, U8 | |||
| vfmadd.d U10, U3, U16, U10 | |||
| vextrins.d U5, U4, 0x01 | |||
| vextrins.d U7, U6, 0x01 | |||
| vextrins.d U9, U8, 0x01 | |||
| vextrins.d U11, U10, 0x01 | |||
| add.d T2, IY, INCY | |||
| fstx.d $f4, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f5, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f6, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f7, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f8, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f9, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f10, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f11, Y, T2 | |||
| slli.d T2, INCY, 3 | |||
| add.d IY, IY, T2 | |||
| add.d T2, IX, INCX | |||
| fldx.d $f4, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f5, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f6, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f7, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f8, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f9, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f10, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f11, X, T2 | |||
| STORE_Y_8 | |||
| vextrins.d U4, U5, 0x10 | |||
| vextrins.d U6, U7, 0x10 | |||
| vextrins.d U8, U9, 0x10 | |||
| vextrins.d U10, U11, 0x10 | |||
| alsl.d IY, INCY, IY, 3 | |||
| vand.v $vr12, $vr2, $vr2 | |||
| LOAD_X_8 | |||
| vfmadd.d U2, U1, U4, U2 | |||
| vfsub.d U2, U2, $vr12 | |||
| vfmadd.d U2, U14, U6, U2 | |||
| vfmadd.d U2, U15, U8, U2 | |||
| vfmadd.d U2, U16, U10, U2 | |||
| vextrins.d U4, U2, 0x01 | |||
| fadd.d $f2, $f2, $f4 | |||
| fadd.d $f2, $f2, $f12 | |||
| vextrins.d U2, U2, 0x10 | |||
| slli.d T2, INCX, 3 | |||
| add.d IX, IX, T2 | |||
| alsl.d IX, INCX, IX, 3 | |||
| addi.d II, II, 64 | |||
| addi.d I, I, 1 | |||
| blt I, T0, .L02 | |||
| // Acc U2 | |||
| GACC vf, d, U4, U2 | |||
| vilvl.d U2, U4, U4 | |||
| .L03: /* &4 */ | |||
| sub.d T0, M, J | |||
| addi.d T0, T0, -1 | |||
| @@ -429,4 +467,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d $sp, $sp, 88 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| EPILOGUE | |||
| @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /* Param */ | |||
| #define M $r4 | |||
| @@ -57,6 +58,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define T2 $r28 | |||
| #define T3 $r29 | |||
| #define T4 $r30 | |||
| #define T5 $r17 | |||
| #define T6 $r16 | |||
| #define T7 $r12 | |||
| /* LSX vectors */ | |||
| #define U0 $vr31 | |||
| @@ -87,10 +91,109 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define a8 $f8 | |||
| #define a9 $f9 | |||
| .macro LOAD_Y_8 /* Load 8 doubles of Y, beginning at offset IY, so that U4/U6/U8/U10 each hold two of them. */ | |||
| beqz T5, .L01_Y_0 /* T5 is set to INCY-1 before INCY is scaled to bytes, so zero selects the contiguous path */ | |||
| fldx.d $f4, Y, IY /* strided path: first element sits at IY itself */ | |||
| add.d T2, IY, INCY /* T2 is the running byte offset, advanced by INCY per element */ | |||
| fldx.d $f5, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f6, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f7, Y, T2 | |||
| PROLOGUE /* NOTE(review): looks like a stray row interleaved by the diff rendering rather than part of this macro - confirm */ | |||
| add.d T2, T2, INCY | |||
| fldx.d $f8, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f9, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f10, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f11, Y, T2 | |||
| LDARG BUFFER, $sp, 0 /* NOTE(review): also looks like an interleaved diff row, unrelated to loading Y - confirm */ | |||
| vextrins.d U4, U5, 0x10 /* 0x10: element 0 of the second operand goes into element 1 of the first; presumably U4..U11 overlap $f4..$f11 (defines not shown) - TODO confirm */ | |||
| vextrins.d U6, U7, 0x10 | |||
| vextrins.d U8, U9, 0x10 | |||
| vextrins.d U10, U11, 0x10 | |||
| b .L01_Y_1 | |||
| .L01_Y_0: /* contiguous path: four 128-bit loads, two doubles each */ | |||
| vldx U4, Y, IY | |||
| alsl.d T2, INCY, IY, 1 /* T2 = IY + 2*INCY */ | |||
| vldx U6, Y, T2 | |||
| alsl.d T3, INCY, T2, 1 /* T3 = T2 + 2*INCY */ | |||
| vldx U8, Y, T3 | |||
| alsl.d T4, INCY, T3, 1 /* T4 = T3 + 2*INCY; T2/T3/T4 are read again by the contiguous path of STORE_Y_8 */ | |||
| vldx U10, Y, T4 | |||
| .L01_Y_1: | |||
| .endm | |||
| .macro STORE_Y_8 /* Store the 8 doubles held in U4/U6/U8/U10 back to Y, beginning at offset IY. */ | |||
| beqz T5, .L01_Y_2 /* T5 is set to INCY-1 before INCY is scaled to bytes, so zero selects the contiguous path */ | |||
| vextrins.d U5, U4, 0x01 /* 0x01: element 1 of the second operand goes into element 0 of the first, so each upper double can be stored as a scalar */ | |||
| vextrins.d U7, U6, 0x01 | |||
| vextrins.d U9, U8, 0x01 | |||
| vextrins.d U11, U10, 0x01 | |||
| fstx.d $f4, Y, IY /* strided path: first element goes to IY itself; presumably $f4..$f11 overlap U4..U11 (defines not shown) - TODO confirm */ | |||
| add.d T2, IY, INCY /* T2 is the running byte offset, advanced by INCY per element */ | |||
| fstx.d $f5, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f6, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f7, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f8, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f9, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f10, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f11, Y, T2 | |||
| b .L01_Y_3 | |||
| .L01_Y_2: /* contiguous path: T2/T3/T4 are not computed here; they must still hold the offsets left by the contiguous path of LOAD_Y_8 */ | |||
| vstx U4, Y, IY | |||
| vstx U6, Y, T2 | |||
| vstx U8, Y, T3 | |||
| vstx U10,Y, T4 | |||
| .L01_Y_3: | |||
| .endm | |||
| .macro LOAD_X_8 /* Load 8 doubles of X, beginning at offset IX, so that U4/U6/U8/U10 each hold two of them. */ | |||
| beqz T6, .L01_X_0 /* T6 is set to INCX-1 before INCX is scaled to bytes, so zero selects the contiguous path */ | |||
| fldx.d $f4, X, IX /* strided path: first element sits at IX itself */ | |||
| add.d T2, IX, INCX /* T2 is the running byte offset, advanced by INCX per element */ | |||
| fldx.d $f5, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f6, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f7, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f8, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f9, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f10, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f11, X, T2 | |||
| vextrins.d U4, U5, 0x10 /* 0x10: element 0 of the second operand goes into element 1 of the first; presumably U4..U11 overlap $f4..$f11 (defines not shown) - TODO confirm */ | |||
| vextrins.d U6, U7, 0x10 | |||
| vextrins.d U8, U9, 0x10 | |||
| vextrins.d U10, U11, 0x10 | |||
| b .L01_X_1 | |||
| .L01_X_0: /* contiguous path: four 128-bit loads, two doubles each */ | |||
| vldx U4, X, IX | |||
| alsl.d T2, INCX, IX, 1 /* T2 = IX + 2*INCX */ | |||
| vldx U6, X, T2 | |||
| alsl.d T3, INCX, T2, 1 /* T3 = T2 + 2*INCX */ | |||
| vldx U8, X, T3 | |||
| alsl.d T4, INCX, T3, 1 /* T4 = T3 + 2*INCX */ | |||
| vldx U10, X, T4 | |||
| .L01_X_1: | |||
| .endm | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -88 | |||
| @@ -107,6 +210,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vldrepl.d VALPHA, $sp, 80 | |||
| addi.d T5, INCY, -1 | |||
| addi.d T6, INCX, -1 | |||
| slli.d LDA, LDA, BASE_SHIFT | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| slli.d INCY, INCY, BASE_SHIFT | |||
| @@ -125,11 +230,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| beq J, M, .L999 | |||
| .L01: | |||
| MTC $f2, $r0 //temp2 | |||
| vxor.v U2, U2, U2 | |||
| fldx.d $f6, X, JX | |||
| fmul.d $f3, ALPHA, $f6 //temp1 | |||
| vshuf4i.d U3, U3, 0x00 | |||
| vshuf4i.d U2, U2, 0x00 | |||
| move IY, $r0 | |||
| move IX, $r0 | |||
| @@ -152,102 +256,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vldx U16, AO1, T1 | |||
| addi.d T1, T1, 16 | |||
| fldx.d $f4, Y, IY | |||
| add.d T2, IY, INCY | |||
| fldx.d $f5, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f6, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f7, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f8, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f9, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f10, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fldx.d $f11, Y, T2 | |||
| vextrins.d U4, U5, 0x10 | |||
| vextrins.d U6, U7, 0x10 | |||
| vextrins.d U8, U9, 0x10 | |||
| vextrins.d U10, U11, 0x10 | |||
| LOAD_Y_8 | |||
| vfmadd.d U4, U3, U1, U4 | |||
| vfmadd.d U6, U3, U14, U6 | |||
| vfmadd.d U8, U3, U15, U8 | |||
| vfmadd.d U10, U3, U16, U10 | |||
| vextrins.d U5, U4, 0x01 | |||
| vextrins.d U7, U6, 0x01 | |||
| vextrins.d U9, U8, 0x01 | |||
| vextrins.d U11, U10, 0x01 | |||
| STORE_Y_8 | |||
| fstx.d $f4, Y, IY | |||
| add.d T2, IY, INCY | |||
| fstx.d $f5, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f6, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f7, Y, T2 | |||
| alsl.d IY, INCY, IY, 3 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f8, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f9, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f10, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.d $f11, Y, T2 | |||
| slli.d T2, INCY, 3 | |||
| add.d IY, IY, T2 | |||
| fldx.d $f4, X, IX | |||
| add.d T2, IX, INCX | |||
| fldx.d $f5, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f6, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f7, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f8, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f9, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f10, X, T2 | |||
| add.d T2, T2, INCX | |||
| fldx.d $f11, X, T2 | |||
| vextrins.d U4, U5, 0x10 | |||
| vextrins.d U6, U7, 0x10 | |||
| vextrins.d U8, U9, 0x10 | |||
| vextrins.d U10, U11, 0x10 | |||
| vand.v $vr12, $vr2, $vr2 | |||
| LOAD_X_8 | |||
| vfmadd.d U2, U1, U4, U2 | |||
| vfsub.d U2, U2, $vr12 | |||
| vfmadd.d U2, U14, U6, U2 | |||
| vfmadd.d U2, U15, U8, U2 | |||
| vfmadd.d U2, U16, U10, U2 | |||
| vextrins.d U4, U2, 0x01 | |||
| fadd.d $f2, $f2, $f4 | |||
| fadd.d $f2, $f2, $f12 | |||
| vextrins.d U2, U2, 0x10 | |||
| slli.d T2, INCX, 3 | |||
| add.d IX, IX, T2 | |||
| alsl.d IX, INCX, IX, 3 | |||
| addi.d II, II, 64 | |||
| addi.d I, I, 1 | |||
| blt I, T0, .L02 | |||
| // Acc U2 | |||
| GACC vf, d, U4, U2 | |||
| vilvl.d U2, U4, U4 | |||
| .L03: /* &4 */ | |||
| andi T0, J, 4 | |||
| beq $r0, T0, .L04 | |||
| @@ -417,4 +453,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d $sp, $sp, 88 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| EPILOGUE | |||
| @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /* Param */ | |||
| #define M $r4 | |||
| @@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define T2 $r28 | |||
| #define T3 $r29 | |||
| #define T4 $r30 | |||
| #define T5 $r17 | |||
| #define T6 $r16 | |||
| /* LSX vectors */ | |||
| #define U0 $vr31 | |||
| @@ -88,77 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define a9 $f9 | |||
| PROLOGUE | |||
| LDARG BUFFER, $sp, 0 | |||
| addi.d $sp, $sp, -88 | |||
| SDARG $r23, $sp, 0 | |||
| SDARG $r24, $sp, 8 | |||
| SDARG $r25, $sp, 16 | |||
| SDARG $r26, $sp, 32 | |||
| SDARG $r27, $sp, 40 | |||
| SDARG $r28, $sp, 48 | |||
| SDARG $r29, $sp, 56 | |||
| SDARG $r30, $sp, 64 | |||
| SDARG $r31, $sp, 72 | |||
| ST ALPHA, $sp, 80 | |||
| vldrepl.w VALPHA, $sp, 80 | |||
| slli.d LDA, LDA, BASE_SHIFT | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| slli.d INCY, INCY, BASE_SHIFT | |||
| bge $r0, M, .L999 | |||
| bge $r0, N, .L999 | |||
| move J, $r0 | |||
| move JY, $r0 | |||
| move JX, $r0 | |||
| move AO1, A | |||
| beq J, N, .L999 | |||
| .L01: | |||
| MTC a2, $r0 //temp2 | |||
| fldx.s a6, X, JX | |||
| fmul.s a3, ALPHA, a6 //temp1 | |||
| vpermi.w U3, U3, 0x00 | |||
| vpermi.w U2, U2, 0x00 | |||
| mul.w T0, J, LDA | |||
| slli.d T1, J, BASE_SHIFT | |||
| add.w T0, T0, T1 | |||
| fldx.s a6, AO1, T0 | |||
| fldx.s a4, Y, JY | |||
| fmadd.s a4, a3, a6, a4 | |||
| fstx.s a4, Y, JY | |||
| move IY, JY | |||
| move IX, JX | |||
| addi.d II, J, 1 | |||
| move I, II | |||
| slli.d II, II, BASE_SHIFT | |||
| sub.d T0, M, J | |||
| addi.d T0, T0, -1 | |||
| srai.d T0, T0, 3 | |||
| add.d T0, T0, J | |||
| addi.d T0, T0, 1 | |||
| beq I, T0, .L03 | |||
| bge I, T0, .L03 | |||
| mul.w T1, J, LDA | |||
| add.d T1, T1, II | |||
| .L02: /* /8 */ | |||
| vldx U1, AO1, T1 | |||
| addi.d T1, T1, 16 | |||
| vldx U14, AO1, T1 | |||
| addi.d T1, T1, 16 | |||
| .macro LOAD_Y_8 | |||
| beqz T5, .L01_Y_0 | |||
| add.d T2, IY, INCY | |||
| fldx.s $f4, Y, T2 | |||
| add.d T2, T2, INCY | |||
| @@ -183,10 +117,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vextrins.w U8, U9, 0x10 | |||
| vextrins.w U8, U10, 0x20 | |||
| vextrins.w U8, U11, 0x30 | |||
| vfmadd.s U4, U3, U1, U4 | |||
| vfmadd.s U8, U3, U14, U8 | |||
| b .L01_Y_1 | |||
| .L01_Y_0: | |||
| add.d T3, IY, INCY | |||
| vldx U4, Y, T3 | |||
| alsl.d T4, INCY, T3, 2 | |||
| vldx U8, Y, T4 | |||
| .L01_Y_1: | |||
| .endm | |||
| .macro STORE_Y_8 | |||
| beqz T5, .L01_Y_2 | |||
| vextrins.w U5, U4, 0x01 | |||
| vextrins.w U6, U4, 0x02 | |||
| vextrins.w U7, U4, 0x03 | |||
| @@ -211,10 +152,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fstx.s $f10, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.s $f11, Y, T2 | |||
| slli.d T2, INCY, 3 | |||
| add.d IY, IY, T2 | |||
| b .L01_Y_3 | |||
| .L01_Y_2: | |||
| vstx U4, Y, T3 | |||
| vstx U8, Y, T4 | |||
| .L01_Y_3: | |||
| .endm | |||
| .macro LOAD_X_8 | |||
| beqz T6, .L01_X_0 | |||
| add.d T2, IX, INCX | |||
| fldx.s $f4, X, T2 | |||
| add.d T2, T2, INCX | |||
| @@ -239,31 +185,109 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vextrins.w $vr8, $vr9, 0x10 | |||
| vextrins.w $vr8, $vr10, 0x20 | |||
| vextrins.w $vr8, $vr11, 0x30 | |||
| b .L01_X_1 | |||
| .L01_X_0: | |||
| add.d T3, IX, INCX | |||
| vldx U4, X, T3 | |||
| alsl.d T4, INCX, T3, 2 | |||
| vldx U8, X, T4 | |||
| .L01_X_1: | |||
| .endm | |||
| vand.v $vr12, $vr2, $vr2 | |||
| PROLOGUE | |||
| vfmadd.s U2, U1, U4, U2 | |||
| vfsub.s U2, U2, $vr12 | |||
| vfmadd.s U2, U14, U8, U2 | |||
| addi.d $sp, $sp, -88 | |||
| vextrins.w U4, U2, 0x01 | |||
| vextrins.w U5, U2, 0x02 | |||
| vextrins.w U6, U2, 0x03 | |||
| SDARG $r23, $sp, 0 | |||
| SDARG $r24, $sp, 8 | |||
| SDARG $r25, $sp, 16 | |||
| SDARG $r26, $sp, 32 | |||
| SDARG $r27, $sp, 40 | |||
| SDARG $r28, $sp, 48 | |||
| SDARG $r29, $sp, 56 | |||
| SDARG $r30, $sp, 64 | |||
| SDARG $r31, $sp, 72 | |||
| ST ALPHA, $sp, 80 | |||
| fadd.s $f2, $f2, $f4 | |||
| fadd.s $f2, $f2, $f5 | |||
| fadd.s $f2, $f2, $f6 | |||
| fadd.s $f2, $f2, $f12 | |||
| vldrepl.w VALPHA, $sp, 80 | |||
| vpermi.w U2, U2, 0x00 | |||
| addi.d T5, INCY, -1 | |||
| addi.d T6, INCX, -1 | |||
| slli.d LDA, LDA, BASE_SHIFT | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| slli.d INCY, INCY, BASE_SHIFT | |||
| bge $r0, M, .L999 | |||
| bge $r0, N, .L999 | |||
| move J, $r0 | |||
| move JY, $r0 | |||
| move JX, $r0 | |||
| move AO1, A | |||
| beq J, N, .L999 | |||
| .L01: | |||
| vxor.v U2, U2, U2 | |||
| fldx.s a6, X, JX | |||
| fmul.s a3, ALPHA, a6 //temp1 | |||
| vpermi.w U3, U3, 0x00 | |||
| mul.w T0, J, LDA | |||
| slli.d T1, J, BASE_SHIFT | |||
| add.w T0, T0, T1 | |||
| fldx.s a6, AO1, T0 | |||
| fldx.s a4, Y, JY | |||
| fmadd.s a4, a3, a6, a4 | |||
| fstx.s a4, Y, JY | |||
| move IY, JY | |||
| move IX, JX | |||
| addi.d II, J, 1 | |||
| move I, II | |||
| slli.d II, II, BASE_SHIFT | |||
| slli.d T2, INCX, 3 | |||
| add.d IX, IX, T2 | |||
| sub.d T0, M, J | |||
| addi.d T0, T0, -1 | |||
| srai.d T0, T0, 3 | |||
| add.d T0, T0, J | |||
| addi.d T0, T0, 1 | |||
| beq I, T0, .L03 | |||
| bge I, T0, .L03 | |||
| mul.w T1, J, LDA | |||
| add.d T1, T1, II | |||
| .L02: /* /8 */ | |||
| vldx U1, AO1, T1 | |||
| addi.d T1, T1, 16 | |||
| vldx U14, AO1, T1 | |||
| addi.d T1, T1, 16 | |||
| LOAD_Y_8 | |||
| vfmadd.s U4, U3, U1, U4 | |||
| vfmadd.s U8, U3, U14, U8 | |||
| STORE_Y_8 | |||
| alsl.d IY, INCY, IY, 3 | |||
| LOAD_X_8 | |||
| vfmadd.s U2, U1, U4, U2 | |||
| vfmadd.s U2, U14, U8, U2 | |||
| alsl.d IX, INCX, IX, 3 | |||
| addi.d II, II, 32 | |||
| addi.d I, I, 1 | |||
| blt I, T0, .L02 | |||
| // Acc U2 | |||
| GACC vf, s, U4, U2 | |||
| vpermi.w U2, U4, 0 | |||
| .L03: /* &4 */ | |||
| sub.d T0, M, J | |||
| addi.d T0, T0, -1 | |||
| @@ -426,4 +450,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d $sp, $sp, 88 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| EPILOGUE | |||
| @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /* Param */ | |||
| #define M $r4 | |||
| @@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define T2 $r28 | |||
| #define T3 $r29 | |||
| #define T4 $r30 | |||
| #define T5 $r17 | |||
| #define T6 $r16 | |||
| /* LSX vectors */ | |||
| #define U0 $vr31 | |||
| @@ -87,67 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define a8 $f8 | |||
| #define a9 $f9 | |||
| PROLOGUE | |||
| LDARG BUFFER, $sp, 0 | |||
| addi.d $sp, $sp, -88 | |||
| SDARG $r23, $sp, 0 | |||
| SDARG $r24, $sp, 8 | |||
| SDARG $r25, $sp, 16 | |||
| SDARG $r26, $sp, 32 | |||
| SDARG $r27, $sp, 40 | |||
| SDARG $r28, $sp, 48 | |||
| SDARG $r29, $sp, 56 | |||
| SDARG $r30, $sp, 64 | |||
| SDARG $r31, $sp, 72 | |||
| ST ALPHA, $sp, 80 | |||
| vldrepl.w VALPHA, $sp, 80 | |||
| slli.d LDA, LDA, BASE_SHIFT | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| slli.d INCY, INCY, BASE_SHIFT | |||
| bge $r0, M, .L999 | |||
| bge $r0, N, .L999 | |||
| sub.d M1, M, N | |||
| mul.d JY, M1, INCY | |||
| mul.d JX, M1, INCX | |||
| move J, M1 | |||
| move AO1, A | |||
| beq J, M, .L999 | |||
| .L01: | |||
| MTC $f2, $r0 //temp2 | |||
| fldx.s $f6, X, JX | |||
| fmul.s $f3, ALPHA, $f6 //temp1 | |||
| vpermi.w U3, U3, 0x00 | |||
| vpermi.w U2, U2, 0x00 | |||
| move IY, $r0 | |||
| move IX, $r0 | |||
| move II, $r0 | |||
| move I, $r0 | |||
| srai.d T0, J, 3 | |||
| beq I, T0, .L03 | |||
| mul.w T1, J, LDA | |||
| add.d T1, T1, II | |||
| .L02: /* /8 */ | |||
| vldx U1, AO1, T1 | |||
| addi.d T1, T1, 16 | |||
| vldx U14, AO1, T1 | |||
| addi.d T1, T1, 16 | |||
| .macro LOAD_Y_8 | |||
| beqz T5, .L01_Y_0 | |||
| fldx.s $f4, Y, IY | |||
| add.d T2, IY, INCY | |||
| fldx.s $f5, Y, T2 | |||
| @@ -171,10 +115,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vextrins.w U8, U9, 0x10 | |||
| vextrins.w U8, U10, 0x20 | |||
| vextrins.w U8, U11, 0x30 | |||
| vfmadd.s U4, U3, U1, U4 | |||
| vfmadd.s U8, U3, U14, U8 | |||
| b .L01_Y_1 | |||
| .L01_Y_0: | |||
| vldx U4, Y, IY | |||
| alsl.d T2, INCY, IY, 2 | |||
| vldx U8, Y, T2 | |||
| .L01_Y_1: | |||
| .endm | |||
| .macro STORE_Y_8 | |||
| beqz T5, .L01_Y_2 | |||
| vextrins.w U5, U4, 0x01 | |||
| vextrins.w U6, U4, 0x02 | |||
| vextrins.w U7, U4, 0x03 | |||
| @@ -198,10 +148,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fstx.s $f10, Y, T2 | |||
| add.d T2, T2, INCY | |||
| fstx.s $f11, Y, T2 | |||
| slli.d T2, INCY, 3 | |||
| add.d IY, IY, T2 | |||
| b .L01_Y_3 | |||
| .L01_Y_2: | |||
| vstx U4, Y, IY | |||
| vstx U8, Y, T2 | |||
| .L01_Y_3: | |||
| .endm | |||
| .macro LOAD_X_8 | |||
| beqz T6, .L01_X_0 | |||
| fldx.s $f4, X, IX | |||
| add.d T2, IX, INCX | |||
| fldx.s $f5, X, T2 | |||
| @@ -225,31 +180,97 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vextrins.w $vr8, $vr9, 0x10 | |||
| vextrins.w $vr8, $vr10, 0x20 | |||
| vextrins.w $vr8, $vr11, 0x30 | |||
| b .L01_X_1 | |||
| .L01_X_0: | |||
| vldx U4, X, IX | |||
| alsl.d T3, INCX, IX, 2 | |||
| vldx U8, X, T3 | |||
| .L01_X_1: | |||
| .endm | |||
| vand.v $vr12, $vr2, $vr2 | |||
| PROLOGUE | |||
| vfmadd.s U2, U1, U4, U2 | |||
| vfsub.s U2, U2, $vr12 | |||
| vfmadd.s U2, U14, U8, U2 | |||
| addi.d $sp, $sp, -88 | |||
| vextrins.w U4, U2, 0x01 | |||
| vextrins.w U5, U2, 0x02 | |||
| vextrins.w U6, U2, 0x03 | |||
| SDARG $r23, $sp, 0 | |||
| SDARG $r24, $sp, 8 | |||
| SDARG $r25, $sp, 16 | |||
| SDARG $r26, $sp, 32 | |||
| SDARG $r27, $sp, 40 | |||
| SDARG $r28, $sp, 48 | |||
| SDARG $r29, $sp, 56 | |||
| SDARG $r30, $sp, 64 | |||
| SDARG $r31, $sp, 72 | |||
| ST ALPHA, $sp, 80 | |||
| fadd.s $f2, $f2, $f4 | |||
| fadd.s $f2, $f2, $f5 | |||
| fadd.s $f2, $f2, $f6 | |||
| fadd.s $f2, $f2, $f12 | |||
| vldrepl.w VALPHA, $sp, 80 | |||
| vpermi.w U2, U2, 0x00 | |||
| addi.d T5, INCY, -1 | |||
| addi.d T6, INCX, -1 | |||
| slli.d LDA, LDA, BASE_SHIFT | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| slli.d INCY, INCY, BASE_SHIFT | |||
| bge $r0, M, .L999 | |||
| bge $r0, N, .L999 | |||
| sub.d M1, M, N | |||
| mul.d JY, M1, INCY | |||
| mul.d JX, M1, INCX | |||
| move J, M1 | |||
| move AO1, A | |||
| beq J, M, .L999 | |||
| .L01: | |||
| vxor.v U2, U2, U2 | |||
| fldx.s $f6, X, JX | |||
| fmul.s $f3, ALPHA, $f6 //temp1 | |||
| vpermi.w U3, U3, 0x00 | |||
| move IY, $r0 | |||
| move IX, $r0 | |||
| move II, $r0 | |||
| move I, $r0 | |||
| srai.d T0, J, 3 | |||
| beq I, T0, .L03 | |||
| mul.w T1, J, LDA | |||
| add.d T1, T1, II | |||
| slli.d T2, INCX, 3 | |||
| add.d IX, IX, T2 | |||
| .L02: /* /8 */ | |||
| vldx U1, AO1, T1 | |||
| addi.d T1, T1, 16 | |||
| vldx U14, AO1, T1 | |||
| addi.d T1, T1, 16 | |||
| LOAD_Y_8 | |||
| vfmadd.s U4, U3, U1, U4 | |||
| vfmadd.s U8, U3, U14, U8 | |||
| STORE_Y_8 | |||
| alsl.d IY, INCY, IY, 3 | |||
| LOAD_X_8 | |||
| vfmadd.s U2, U1, U4, U2 | |||
| vfmadd.s U2, U14, U8, U2 | |||
| alsl.d IX, INCX, IX, 3 | |||
| addi.d II, II, 32 | |||
| addi.d I, I, 1 | |||
| blt I, T0, .L02 | |||
| // Acc U2 | |||
| GACC vf, s, U4, U2 | |||
| vpermi.w U2, U4, 0x00 | |||
| .L03: /* &4 */ | |||
| andi T0, J, 4 | |||
| beq $r0, T0, .L04 | |||
| @@ -414,4 +435,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d $sp, $sp, 88 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| EPILOGUE | |||