Browse Source

Merge pull request #5073 from XiWeiGu/la64_update_symv_lsx_version

LoongArch64: Update symv lsx version
tags/v0.3.30
Martin Kroeker GitHub 1 year ago
parent
commit
eba7338484
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
4 changed files with 464 additions and 345 deletions
  1. +123
    -85
      kernel/loongarch64/dsymv_L_lsx.S
  2. +118
    -82
      kernel/loongarch64/dsymv_U_lsx.S
  3. +118
    -94
      kernel/loongarch64/ssymv_L_lsx.S
  4. +105
    -84
      kernel/loongarch64/ssymv_U_lsx.S

+ 123
- 85
kernel/loongarch64/dsymv_L_lsx.S View File

@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER #define ASSEMBLER


#include "common.h" #include "common.h"
#include "loongarch64_asm.S"


/* Param */ /* Param */
#define M $r4 #define M $r4
@@ -57,6 +58,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28 #define T2 $r28
#define T3 $r29 #define T3 $r29
#define T4 $r30 #define T4 $r30
#define T5 $r17
#define T6 $r16
#define T7 $r12


/* LSX vectors */ /* LSX vectors */
#define U0 $vr31 #define U0 $vr31
@@ -87,10 +91,114 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8 #define a8 $f8
#define a9 $f9 #define a9 $f9


/*
 * LOAD_Y_8 - load eight doubles of Y into U4, U6, U8 and U10 (two per vector).
 *
 * The first element read is at offset IY + INCY; IY itself is not modified.
 * INCY has already been shifted left by BASE_SHIFT in the function setup.
 *
 * T5 selects the path. The setup computes T5 = INCY - 1 before INCY is
 * scaled, so T5 == 0 means unit stride:
 *   T5 != 0 : eight scalar loads into $f4..$f11, then packed pairwise.
 *   T5 == 0 : four 128-bit vector loads (.L01_Y_0).
 *
 * Clobbers: T2 on the strided path; T7, T2, T3, T4 on the unit-stride path.
 * The unit-stride path leaves the four vector offsets in T7/T2/T3/T4.
 *
 * The labels are fixed names, so this macro can be expanded only once per
 * assembly unit.
 *
 * Presumably U4..U11 alias $vr4..$vr11, whose low lanes are $f4..$f11 -
 * the #defines are not visible here; TODO confirm.
 */
.macro LOAD_Y_8
beqz T5, .L01_Y_0
// Strided path: walk T2 forward by INCY, one scalar load per element.
add.d T2, IY, INCY
fldx.d $f4, Y, T2
add.d T2, T2, INCY
fldx.d $f5, Y, T2
add.d T2, T2, INCY
fldx.d $f6, Y, T2
add.d T2, T2, INCY
fldx.d $f7, Y, T2


// NOTE(review): the PROLOGUE line below sits between the fourth and fifth
// element loads; it looks like a leftover from the pre-change side of the
// diff rather than part of the macro - confirm against the real file.
PROLOGUE
add.d T2, T2, INCY
fldx.d $f8, Y, T2
add.d T2, T2, INCY
fldx.d $f9, Y, T2
add.d T2, T2, INCY
fldx.d $f10, Y, T2
add.d T2, T2, INCY
fldx.d $f11, Y, T2

// Pack pairs: lane 0 of the odd register is copied into lane 1 of the even one.
vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
b .L01_Y_1
.L01_Y_0:
// Unit-stride path: each alsl.d advances the offset by 2 * INCY (two elements).
add.d T7, IY, INCY
vldx U4, Y, T7
alsl.d T2, INCY, T7, 1
vldx U6, Y, T2
alsl.d T3, INCY, T2, 1
vldx U8, Y, T3
alsl.d T4, INCY, T3, 1
vldx U10, Y, T4
.L01_Y_1:
.endm

/*
 * LOAD_X_8 - load eight doubles of X into U4, U6, U8 and U10 (two per vector).
 *
 * The first element read is at offset IX + INCX; IX itself is not modified.
 * INCX has already been shifted left by BASE_SHIFT in the function setup.
 *
 * T6 selects the path. The setup computes T6 = INCX - 1 before INCX is
 * scaled, so T6 == 0 means unit stride:
 *   T6 != 0 : eight scalar loads into $f4..$f11, then packed pairwise.
 *   T6 == 0 : four 128-bit vector loads (.L01_X_0).
 *
 * Clobbers: T2 on the strided path; T7, T2, T3, T4 on the unit-stride path.
 *
 * The labels are fixed names, so this macro can be expanded only once per
 * assembly unit.
 *
 * Presumably U4..U11 alias $vr4..$vr11, whose low lanes are $f4..$f11 -
 * the #defines are not visible here; TODO confirm.
 */
.macro LOAD_X_8
beqz T6, .L01_X_0
// Strided path: walk T2 forward by INCX, one scalar load per element.
add.d T2, IX, INCX
fldx.d $f4, X, T2
add.d T2, T2, INCX
fldx.d $f5, X, T2
add.d T2, T2, INCX
fldx.d $f6, X, T2
add.d T2, T2, INCX
fldx.d $f7, X, T2

add.d T2, T2, INCX
fldx.d $f8, X, T2
add.d T2, T2, INCX
fldx.d $f9, X, T2
add.d T2, T2, INCX
fldx.d $f10, X, T2
add.d T2, T2, INCX
fldx.d $f11, X, T2

// Pack pairs: lane 0 of the odd register is copied into lane 1 of the even one.
vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
b .L01_X_1
.L01_X_0:
// Unit-stride path: each alsl.d advances the offset by 2 * INCX (two elements).
add.d T7, IX, INCX
vldx U4, X, T7
alsl.d T2, INCX, T7, 1
vldx U6, X, T2
alsl.d T3, INCX, T2, 1
vldx U8, X, T3
alsl.d T4, INCX, T3, 1
vldx U10, X, T4
.L01_X_1:
.endm

/*
 * STORE_Y_8 - write the eight doubles held in U4, U6, U8 and U10 back to Y.
 *
 * The first element written is at offset IY + INCY; IY is not modified.
 *
 * T5 selects the path (T5 == 0 means unit stride, see the setup where
 * T5 = INCY - 1 is computed before INCY is scaled):
 *   T5 != 0 : unpack each vector's upper lane and do eight scalar stores.
 *   T5 == 0 : four 128-bit vector stores (.L01_Y_2).
 *
 * The unit-stride path does NOT compute its offsets: it stores through
 * T7, T2, T3 and T4 as they are on entry. They must still hold the offsets
 * produced by the unit-stride path of LOAD_Y_8, so nothing between the two
 * macros may overwrite those registers.
 *
 * Clobbers: U5, U7, U9, U11 and T2 on the strided path; nothing on the
 * unit-stride path.
 *
 * The labels are fixed names, so this macro can be expanded only once per
 * assembly unit.
 */
.macro STORE_Y_8
beqz T5, .L01_Y_2
// Unpack: lane 1 of each even register is copied into lane 0 of the odd one,
// so every element is reachable by a scalar store.
vextrins.d U5, U4, 0x01
vextrins.d U7, U6, 0x01
vextrins.d U9, U8, 0x01
vextrins.d U11, U10, 0x01

// Strided path: walk T2 forward by INCY, one scalar store per element.
add.d T2, IY, INCY
fstx.d $f4, Y, T2
add.d T2, T2, INCY
fstx.d $f5, Y, T2
add.d T2, T2, INCY
fstx.d $f6, Y, T2
add.d T2, T2, INCY
fstx.d $f7, Y, T2

add.d T2, T2, INCY
fstx.d $f8, Y, T2
add.d T2, T2, INCY
fstx.d $f9, Y, T2
add.d T2, T2, INCY
fstx.d $f10, Y, T2
add.d T2, T2, INCY
fstx.d $f11, Y, T2
b .L01_Y_3
.L01_Y_2:
// Unit-stride path: reuse the offsets left in T7/T2/T3/T4 by the load.
vstx U4, Y, T7
vstx U6, Y, T2
vstx U8, Y, T3
vstx U10, Y, T4
.L01_Y_3:
.endm


LDARG BUFFER, $sp, 0
PROLOGUE


addi.d $sp, $sp, -88 addi.d $sp, $sp, -88


@@ -107,6 +215,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


vldrepl.d VALPHA, $sp, 80 vldrepl.d VALPHA, $sp, 80


addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT
@@ -122,11 +232,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
beq J, N, .L999 beq J, N, .L999


.L01: .L01:
MTC a2, $r0 //temp2
vxor.v U2, U2, U2
fldx.d a6, X, JX fldx.d a6, X, JX
fmul.d a3, ALPHA, a6 //temp1 fmul.d a3, ALPHA, a6 //temp1
vshuf4i.d U3, U3, 0x00 vshuf4i.d U3, U3, 0x00
vshuf4i.d U2, U2, 0x00


mul.d T0, J, LDA mul.d T0, J, LDA
slli.d T1, J, BASE_SHIFT slli.d T1, J, BASE_SHIFT
@@ -163,105 +272,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vldx U16, AO1, T1 vldx U16, AO1, T1
addi.d T1, T1, 16 addi.d T1, T1, 16


add.d T2, IY, INCY
fldx.d $f4, Y, T2
add.d T2, T2, INCY
fldx.d $f5, Y, T2
add.d T2, T2, INCY
fldx.d $f6, Y, T2
add.d T2, T2, INCY
fldx.d $f7, Y, T2

add.d T2, T2, INCY
fldx.d $f8, Y, T2
add.d T2, T2, INCY
fldx.d $f9, Y, T2
add.d T2, T2, INCY
fldx.d $f10, Y, T2
add.d T2, T2, INCY
fldx.d $f11, Y, T2

vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
LOAD_Y_8


vfmadd.d U4, U3, U1, U4 vfmadd.d U4, U3, U1, U4
vfmadd.d U6, U3, U14, U6 vfmadd.d U6, U3, U14, U6
vfmadd.d U8, U3, U15, U8 vfmadd.d U8, U3, U15, U8
vfmadd.d U10, U3, U16, U10 vfmadd.d U10, U3, U16, U10


vextrins.d U5, U4, 0x01
vextrins.d U7, U6, 0x01
vextrins.d U9, U8, 0x01
vextrins.d U11, U10, 0x01

add.d T2, IY, INCY
fstx.d $f4, Y, T2
add.d T2, T2, INCY
fstx.d $f5, Y, T2
add.d T2, T2, INCY
fstx.d $f6, Y, T2
add.d T2, T2, INCY
fstx.d $f7, Y, T2

add.d T2, T2, INCY
fstx.d $f8, Y, T2
add.d T2, T2, INCY
fstx.d $f9, Y, T2
add.d T2, T2, INCY
fstx.d $f10, Y, T2
add.d T2, T2, INCY
fstx.d $f11, Y, T2

slli.d T2, INCY, 3
add.d IY, IY, T2

add.d T2, IX, INCX
fldx.d $f4, X, T2
add.d T2, T2, INCX
fldx.d $f5, X, T2
add.d T2, T2, INCX
fldx.d $f6, X, T2
add.d T2, T2, INCX
fldx.d $f7, X, T2

add.d T2, T2, INCX
fldx.d $f8, X, T2
add.d T2, T2, INCX
fldx.d $f9, X, T2
add.d T2, T2, INCX
fldx.d $f10, X, T2
add.d T2, T2, INCX
fldx.d $f11, X, T2
STORE_Y_8


vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
alsl.d IY, INCY, IY, 3


vand.v $vr12, $vr2, $vr2
LOAD_X_8


vfmadd.d U2, U1, U4, U2 vfmadd.d U2, U1, U4, U2
vfsub.d U2, U2, $vr12
vfmadd.d U2, U14, U6, U2 vfmadd.d U2, U14, U6, U2
vfmadd.d U2, U15, U8, U2 vfmadd.d U2, U15, U8, U2
vfmadd.d U2, U16, U10, U2 vfmadd.d U2, U16, U10, U2


vextrins.d U4, U2, 0x01

fadd.d $f2, $f2, $f4
fadd.d $f2, $f2, $f12

vextrins.d U2, U2, 0x10

slli.d T2, INCX, 3
add.d IX, IX, T2
alsl.d IX, INCX, IX, 3


addi.d II, II, 64 addi.d II, II, 64
addi.d I, I, 1 addi.d I, I, 1
blt I, T0, .L02 blt I, T0, .L02


// Acc U2
GACC vf, d, U4, U2
vilvl.d U2, U4, U4

.L03: /* &4 */ .L03: /* &4 */
sub.d T0, M, J sub.d T0, M, J
addi.d T0, T0, -1 addi.d T0, T0, -1
@@ -429,4 +467,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88 addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0 jirl $r0, $r1, 0x0


EPILOGUE
EPILOGUE

+ 118
- 82
kernel/loongarch64/dsymv_U_lsx.S View File

@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER #define ASSEMBLER


#include "common.h" #include "common.h"
#include "loongarch64_asm.S"


/* Param */ /* Param */
#define M $r4 #define M $r4
@@ -57,6 +58,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28 #define T2 $r28
#define T3 $r29 #define T3 $r29
#define T4 $r30 #define T4 $r30
#define T5 $r17
#define T6 $r16
#define T7 $r12


/* LSX vectors */ /* LSX vectors */
#define U0 $vr31 #define U0 $vr31
@@ -87,10 +91,109 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8 #define a8 $f8
#define a9 $f9 #define a9 $f9


/*
 * LOAD_Y_8 - load eight doubles of Y into U4, U6, U8 and U10 (two per vector).
 *
 * The first element read is at offset IY; IY itself is not modified.
 * INCY has already been shifted left by BASE_SHIFT in the function setup.
 *
 * T5 selects the path. The setup computes T5 = INCY - 1 before INCY is
 * scaled, so T5 == 0 means unit stride:
 *   T5 != 0 : eight scalar loads into $f4..$f11, then packed pairwise.
 *   T5 == 0 : four 128-bit vector loads (.L01_Y_0).
 *
 * Clobbers: T2 on the strided path; T2, T3, T4 on the unit-stride path.
 * The unit-stride path leaves the offsets of the second, third and fourth
 * vectors in T2/T3/T4 (the first vector is at IY).
 *
 * The labels are fixed names, so this macro can be expanded only once per
 * assembly unit.
 *
 * Presumably U4..U11 alias $vr4..$vr11, whose low lanes are $f4..$f11 -
 * the #defines are not visible here; TODO confirm.
 */
.macro LOAD_Y_8
beqz T5, .L01_Y_0
// Strided path: first element at IY, then walk T2 forward by INCY.
fldx.d $f4, Y, IY
add.d T2, IY, INCY
fldx.d $f5, Y, T2
add.d T2, T2, INCY
fldx.d $f6, Y, T2
add.d T2, T2, INCY
fldx.d $f7, Y, T2


// NOTE(review): the PROLOGUE line below sits between the fourth and fifth
// element loads; it looks like a leftover from the pre-change side of the
// diff rather than part of the macro - confirm against the real file.
PROLOGUE
add.d T2, T2, INCY
fldx.d $f8, Y, T2
add.d T2, T2, INCY
fldx.d $f9, Y, T2
add.d T2, T2, INCY
fldx.d $f10, Y, T2
add.d T2, T2, INCY
fldx.d $f11, Y, T2


// NOTE(review): the LDARG line below does not relate to the surrounding
// loads and also looks like a leftover from the pre-change side of the
// diff - confirm against the real file.
LDARG BUFFER, $sp, 0
// Pack pairs: lane 0 of the odd register is copied into lane 1 of the even one.
vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
b .L01_Y_1
.L01_Y_0:
// Unit-stride path: each alsl.d advances the offset by 2 * INCY (two elements).
vldx U4, Y, IY
alsl.d T2, INCY, IY, 1
vldx U6, Y, T2
alsl.d T3, INCY, T2, 1
vldx U8, Y, T3
alsl.d T4, INCY, T3, 1
vldx U10, Y, T4
.L01_Y_1:
.endm

/*
 * STORE_Y_8 - write the eight doubles held in U4, U6, U8 and U10 back to Y.
 *
 * The first element written is at offset IY; IY is not modified.
 *
 * T5 selects the path (T5 == 0 means unit stride, see the setup where
 * T5 = INCY - 1 is computed before INCY is scaled):
 *   T5 != 0 : unpack each vector's upper lane and do eight scalar stores.
 *   T5 == 0 : four 128-bit vector stores (.L01_Y_2).
 *
 * The unit-stride path does NOT compute its offsets: the first vector goes
 * to IY and the other three go through T2, T3 and T4 as they are on entry.
 * They must still hold the offsets produced by the unit-stride path of
 * LOAD_Y_8, so nothing between the two macros may overwrite them.
 *
 * Clobbers: U5, U7, U9, U11 and T2 on the strided path; nothing on the
 * unit-stride path.
 *
 * The labels are fixed names, so this macro can be expanded only once per
 * assembly unit.
 */
.macro STORE_Y_8
beqz T5, .L01_Y_2
// Unpack: lane 1 of each even register is copied into lane 0 of the odd one,
// so every element is reachable by a scalar store.
vextrins.d U5, U4, 0x01
vextrins.d U7, U6, 0x01
vextrins.d U9, U8, 0x01
vextrins.d U11, U10, 0x01

// Strided path: first element at IY, then walk T2 forward by INCY.
fstx.d $f4, Y, IY
add.d T2, IY, INCY
fstx.d $f5, Y, T2
add.d T2, T2, INCY
fstx.d $f6, Y, T2
add.d T2, T2, INCY
fstx.d $f7, Y, T2

add.d T2, T2, INCY
fstx.d $f8, Y, T2
add.d T2, T2, INCY
fstx.d $f9, Y, T2
add.d T2, T2, INCY
fstx.d $f10, Y, T2
add.d T2, T2, INCY
fstx.d $f11, Y, T2
b .L01_Y_3
.L01_Y_2:
// Unit-stride path: reuse IY and the offsets left in T2/T3/T4 by the load.
vstx U4, Y, IY
vstx U6, Y, T2
vstx U8, Y, T3
vstx U10,Y, T4
.L01_Y_3:
.endm

/*
 * LOAD_X_8 - load eight doubles of X into U4, U6, U8 and U10 (two per vector).
 *
 * The first element read is at offset IX; IX itself is not modified.
 * INCX has already been shifted left by BASE_SHIFT in the function setup.
 *
 * T6 selects the path. The setup computes T6 = INCX - 1 before INCX is
 * scaled, so T6 == 0 means unit stride:
 *   T6 != 0 : eight scalar loads into $f4..$f11, then packed pairwise.
 *   T6 == 0 : four 128-bit vector loads (.L01_X_0).
 *
 * Clobbers: T2 on the strided path; T2, T3, T4 on the unit-stride path.
 *
 * The labels are fixed names, so this macro can be expanded only once per
 * assembly unit.
 *
 * Presumably U4..U11 alias $vr4..$vr11, whose low lanes are $f4..$f11 -
 * the #defines are not visible here; TODO confirm.
 */
.macro LOAD_X_8
beqz T6, .L01_X_0
// Strided path: first element at IX, then walk T2 forward by INCX.
fldx.d $f4, X, IX
add.d T2, IX, INCX
fldx.d $f5, X, T2
add.d T2, T2, INCX
fldx.d $f6, X, T2
add.d T2, T2, INCX
fldx.d $f7, X, T2

add.d T2, T2, INCX
fldx.d $f8, X, T2
add.d T2, T2, INCX
fldx.d $f9, X, T2
add.d T2, T2, INCX
fldx.d $f10, X, T2
add.d T2, T2, INCX
fldx.d $f11, X, T2

// Pack pairs: lane 0 of the odd register is copied into lane 1 of the even one.
vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
b .L01_X_1
.L01_X_0:
// Unit-stride path: each alsl.d advances the offset by 2 * INCX (two elements).
vldx U4, X, IX
alsl.d T2, INCX, IX, 1
vldx U6, X, T2
alsl.d T3, INCX, T2, 1
vldx U8, X, T3
alsl.d T4, INCX, T3, 1
vldx U10, X, T4
.L01_X_1:
.endm

PROLOGUE


addi.d $sp, $sp, -88 addi.d $sp, $sp, -88


@@ -107,6 +210,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


vldrepl.d VALPHA, $sp, 80 vldrepl.d VALPHA, $sp, 80


addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT
@@ -125,11 +230,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
beq J, M, .L999 beq J, M, .L999


.L01: .L01:
MTC $f2, $r0 //temp2
vxor.v U2, U2, U2
fldx.d $f6, X, JX fldx.d $f6, X, JX
fmul.d $f3, ALPHA, $f6 //temp1 fmul.d $f3, ALPHA, $f6 //temp1
vshuf4i.d U3, U3, 0x00 vshuf4i.d U3, U3, 0x00
vshuf4i.d U2, U2, 0x00


move IY, $r0 move IY, $r0
move IX, $r0 move IX, $r0
@@ -152,102 +256,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vldx U16, AO1, T1 vldx U16, AO1, T1
addi.d T1, T1, 16 addi.d T1, T1, 16


fldx.d $f4, Y, IY
add.d T2, IY, INCY
fldx.d $f5, Y, T2
add.d T2, T2, INCY
fldx.d $f6, Y, T2
add.d T2, T2, INCY
fldx.d $f7, Y, T2

add.d T2, T2, INCY
fldx.d $f8, Y, T2
add.d T2, T2, INCY
fldx.d $f9, Y, T2
add.d T2, T2, INCY
fldx.d $f10, Y, T2
add.d T2, T2, INCY
fldx.d $f11, Y, T2

vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
LOAD_Y_8


vfmadd.d U4, U3, U1, U4 vfmadd.d U4, U3, U1, U4
vfmadd.d U6, U3, U14, U6 vfmadd.d U6, U3, U14, U6
vfmadd.d U8, U3, U15, U8 vfmadd.d U8, U3, U15, U8
vfmadd.d U10, U3, U16, U10 vfmadd.d U10, U3, U16, U10


vextrins.d U5, U4, 0x01
vextrins.d U7, U6, 0x01
vextrins.d U9, U8, 0x01
vextrins.d U11, U10, 0x01
STORE_Y_8


fstx.d $f4, Y, IY
add.d T2, IY, INCY
fstx.d $f5, Y, T2
add.d T2, T2, INCY
fstx.d $f6, Y, T2
add.d T2, T2, INCY
fstx.d $f7, Y, T2
alsl.d IY, INCY, IY, 3


add.d T2, T2, INCY
fstx.d $f8, Y, T2
add.d T2, T2, INCY
fstx.d $f9, Y, T2
add.d T2, T2, INCY
fstx.d $f10, Y, T2
add.d T2, T2, INCY
fstx.d $f11, Y, T2

slli.d T2, INCY, 3
add.d IY, IY, T2

fldx.d $f4, X, IX
add.d T2, IX, INCX
fldx.d $f5, X, T2
add.d T2, T2, INCX
fldx.d $f6, X, T2
add.d T2, T2, INCX
fldx.d $f7, X, T2

add.d T2, T2, INCX
fldx.d $f8, X, T2
add.d T2, T2, INCX
fldx.d $f9, X, T2
add.d T2, T2, INCX
fldx.d $f10, X, T2
add.d T2, T2, INCX
fldx.d $f11, X, T2

vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10

vand.v $vr12, $vr2, $vr2
LOAD_X_8


vfmadd.d U2, U1, U4, U2 vfmadd.d U2, U1, U4, U2
vfsub.d U2, U2, $vr12
vfmadd.d U2, U14, U6, U2 vfmadd.d U2, U14, U6, U2
vfmadd.d U2, U15, U8, U2 vfmadd.d U2, U15, U8, U2
vfmadd.d U2, U16, U10, U2 vfmadd.d U2, U16, U10, U2


vextrins.d U4, U2, 0x01

fadd.d $f2, $f2, $f4
fadd.d $f2, $f2, $f12

vextrins.d U2, U2, 0x10

slli.d T2, INCX, 3
add.d IX, IX, T2
alsl.d IX, INCX, IX, 3


addi.d II, II, 64 addi.d II, II, 64
addi.d I, I, 1 addi.d I, I, 1
blt I, T0, .L02 blt I, T0, .L02


// Acc U2
GACC vf, d, U4, U2
vilvl.d U2, U4, U4

.L03: /* &4 */ .L03: /* &4 */
andi T0, J, 4 andi T0, J, 4
beq $r0, T0, .L04 beq $r0, T0, .L04
@@ -417,4 +453,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88 addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0 jirl $r0, $r1, 0x0


EPILOGUE
EPILOGUE

+ 118
- 94
kernel/loongarch64/ssymv_L_lsx.S View File

@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER #define ASSEMBLER


#include "common.h" #include "common.h"
#include "loongarch64_asm.S"


/* Param */ /* Param */
#define M $r4 #define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28 #define T2 $r28
#define T3 $r29 #define T3 $r29
#define T4 $r30 #define T4 $r30
#define T5 $r17
#define T6 $r16


/* LSX vectors */ /* LSX vectors */
#define U0 $vr31 #define U0 $vr31
@@ -88,77 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a9 $f9 #define a9 $f9




PROLOGUE

LDARG BUFFER, $sp, 0

addi.d $sp, $sp, -88

SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80

vldrepl.w VALPHA, $sp, 80

slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT

bge $r0, M, .L999
bge $r0, N, .L999

move J, $r0
move JY, $r0
move JX, $r0
move AO1, A

beq J, N, .L999

.L01:
MTC a2, $r0 //temp2
fldx.s a6, X, JX
fmul.s a3, ALPHA, a6 //temp1
vpermi.w U3, U3, 0x00
vpermi.w U2, U2, 0x00

mul.w T0, J, LDA
slli.d T1, J, BASE_SHIFT
add.w T0, T0, T1
fldx.s a6, AO1, T0
fldx.s a4, Y, JY
fmadd.s a4, a3, a6, a4
fstx.s a4, Y, JY

move IY, JY
move IX, JX
addi.d II, J, 1
move I, II
slli.d II, II, BASE_SHIFT

sub.d T0, M, J
addi.d T0, T0, -1
srai.d T0, T0, 3
add.d T0, T0, J
addi.d T0, T0, 1
beq I, T0, .L03
bge I, T0, .L03

mul.w T1, J, LDA
add.d T1, T1, II

.L02: /* /8 */
vldx U1, AO1, T1
addi.d T1, T1, 16
vldx U14, AO1, T1
addi.d T1, T1, 16

.macro LOAD_Y_8
beqz T5, .L01_Y_0
add.d T2, IY, INCY add.d T2, IY, INCY
fldx.s $f4, Y, T2 fldx.s $f4, Y, T2
add.d T2, T2, INCY add.d T2, T2, INCY
@@ -183,10 +117,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w U8, U9, 0x10 vextrins.w U8, U9, 0x10
vextrins.w U8, U10, 0x20 vextrins.w U8, U10, 0x20
vextrins.w U8, U11, 0x30 vextrins.w U8, U11, 0x30

vfmadd.s U4, U3, U1, U4
vfmadd.s U8, U3, U14, U8

b .L01_Y_1
.L01_Y_0:
add.d T3, IY, INCY
vldx U4, Y, T3
alsl.d T4, INCY, T3, 2
vldx U8, Y, T4
.L01_Y_1:
.endm

.macro STORE_Y_8
beqz T5, .L01_Y_2
vextrins.w U5, U4, 0x01 vextrins.w U5, U4, 0x01
vextrins.w U6, U4, 0x02 vextrins.w U6, U4, 0x02
vextrins.w U7, U4, 0x03 vextrins.w U7, U4, 0x03
@@ -211,10 +152,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fstx.s $f10, Y, T2 fstx.s $f10, Y, T2
add.d T2, T2, INCY add.d T2, T2, INCY
fstx.s $f11, Y, T2 fstx.s $f11, Y, T2

slli.d T2, INCY, 3
add.d IY, IY, T2

b .L01_Y_3
.L01_Y_2:
vstx U4, Y, T3
vstx U8, Y, T4
.L01_Y_3:
.endm

.macro LOAD_X_8
beqz T6, .L01_X_0
add.d T2, IX, INCX add.d T2, IX, INCX
fldx.s $f4, X, T2 fldx.s $f4, X, T2
add.d T2, T2, INCX add.d T2, T2, INCX
@@ -239,31 +185,109 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w $vr8, $vr9, 0x10 vextrins.w $vr8, $vr9, 0x10
vextrins.w $vr8, $vr10, 0x20 vextrins.w $vr8, $vr10, 0x20
vextrins.w $vr8, $vr11, 0x30 vextrins.w $vr8, $vr11, 0x30
b .L01_X_1
.L01_X_0:
add.d T3, IX, INCX
vldx U4, X, T3
alsl.d T4, INCX, T3, 2
vldx U8, X, T4
.L01_X_1:
.endm


vand.v $vr12, $vr2, $vr2
PROLOGUE


vfmadd.s U2, U1, U4, U2
vfsub.s U2, U2, $vr12
vfmadd.s U2, U14, U8, U2
addi.d $sp, $sp, -88


vextrins.w U4, U2, 0x01
vextrins.w U5, U2, 0x02
vextrins.w U6, U2, 0x03
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80


fadd.s $f2, $f2, $f4
fadd.s $f2, $f2, $f5
fadd.s $f2, $f2, $f6
fadd.s $f2, $f2, $f12
vldrepl.w VALPHA, $sp, 80


vpermi.w U2, U2, 0x00
addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT

bge $r0, M, .L999
bge $r0, N, .L999

move J, $r0
move JY, $r0
move JX, $r0
move AO1, A

beq J, N, .L999

.L01:
vxor.v U2, U2, U2
fldx.s a6, X, JX
fmul.s a3, ALPHA, a6 //temp1
vpermi.w U3, U3, 0x00

mul.w T0, J, LDA
slli.d T1, J, BASE_SHIFT
add.w T0, T0, T1
fldx.s a6, AO1, T0
fldx.s a4, Y, JY
fmadd.s a4, a3, a6, a4
fstx.s a4, Y, JY

move IY, JY
move IX, JX
addi.d II, J, 1
move I, II
slli.d II, II, BASE_SHIFT


slli.d T2, INCX, 3
add.d IX, IX, T2
sub.d T0, M, J
addi.d T0, T0, -1
srai.d T0, T0, 3
add.d T0, T0, J
addi.d T0, T0, 1
beq I, T0, .L03
bge I, T0, .L03

mul.w T1, J, LDA
add.d T1, T1, II

.L02: /* /8 */
vldx U1, AO1, T1
addi.d T1, T1, 16
vldx U14, AO1, T1
addi.d T1, T1, 16

LOAD_Y_8

vfmadd.s U4, U3, U1, U4
vfmadd.s U8, U3, U14, U8

STORE_Y_8

alsl.d IY, INCY, IY, 3

LOAD_X_8

vfmadd.s U2, U1, U4, U2
vfmadd.s U2, U14, U8, U2

alsl.d IX, INCX, IX, 3


addi.d II, II, 32 addi.d II, II, 32
addi.d I, I, 1 addi.d I, I, 1
blt I, T0, .L02 blt I, T0, .L02


// Acc U2
GACC vf, s, U4, U2
vpermi.w U2, U4, 0

.L03: /* &4 */ .L03: /* &4 */
sub.d T0, M, J sub.d T0, M, J
addi.d T0, T0, -1 addi.d T0, T0, -1
@@ -426,4 +450,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88 addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0 jirl $r0, $r1, 0x0


EPILOGUE
EPILOGUE

+ 105
- 84
kernel/loongarch64/ssymv_U_lsx.S View File

@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER #define ASSEMBLER


#include "common.h" #include "common.h"
#include "loongarch64_asm.S"


/* Param */ /* Param */
#define M $r4 #define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28 #define T2 $r28
#define T3 $r29 #define T3 $r29
#define T4 $r30 #define T4 $r30
#define T5 $r17
#define T6 $r16


/* LSX vectors */ /* LSX vectors */
#define U0 $vr31 #define U0 $vr31
@@ -87,67 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8 #define a8 $f8
#define a9 $f9 #define a9 $f9



PROLOGUE

LDARG BUFFER, $sp, 0

addi.d $sp, $sp, -88

SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80

vldrepl.w VALPHA, $sp, 80

slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT

bge $r0, M, .L999
bge $r0, N, .L999

sub.d M1, M, N

mul.d JY, M1, INCY
mul.d JX, M1, INCX

move J, M1
move AO1, A

beq J, M, .L999

.L01:
MTC $f2, $r0 //temp2
fldx.s $f6, X, JX
fmul.s $f3, ALPHA, $f6 //temp1
vpermi.w U3, U3, 0x00
vpermi.w U2, U2, 0x00

move IY, $r0
move IX, $r0
move II, $r0
move I, $r0

srai.d T0, J, 3
beq I, T0, .L03

mul.w T1, J, LDA
add.d T1, T1, II

.L02: /* /8 */
vldx U1, AO1, T1
addi.d T1, T1, 16
vldx U14, AO1, T1
addi.d T1, T1, 16

.macro LOAD_Y_8
beqz T5, .L01_Y_0
fldx.s $f4, Y, IY fldx.s $f4, Y, IY
add.d T2, IY, INCY add.d T2, IY, INCY
fldx.s $f5, Y, T2 fldx.s $f5, Y, T2
@@ -171,10 +115,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w U8, U9, 0x10 vextrins.w U8, U9, 0x10
vextrins.w U8, U10, 0x20 vextrins.w U8, U10, 0x20
vextrins.w U8, U11, 0x30 vextrins.w U8, U11, 0x30

vfmadd.s U4, U3, U1, U4
vfmadd.s U8, U3, U14, U8

b .L01_Y_1
.L01_Y_0:
vldx U4, Y, IY
alsl.d T2, INCY, IY, 2
vldx U8, Y, T2
.L01_Y_1:
.endm

.macro STORE_Y_8
beqz T5, .L01_Y_2
vextrins.w U5, U4, 0x01 vextrins.w U5, U4, 0x01
vextrins.w U6, U4, 0x02 vextrins.w U6, U4, 0x02
vextrins.w U7, U4, 0x03 vextrins.w U7, U4, 0x03
@@ -198,10 +148,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fstx.s $f10, Y, T2 fstx.s $f10, Y, T2
add.d T2, T2, INCY add.d T2, T2, INCY
fstx.s $f11, Y, T2 fstx.s $f11, Y, T2

slli.d T2, INCY, 3
add.d IY, IY, T2

b .L01_Y_3
.L01_Y_2:
vstx U4, Y, IY
vstx U8, Y, T2
.L01_Y_3:
.endm

.macro LOAD_X_8
beqz T6, .L01_X_0
fldx.s $f4, X, IX fldx.s $f4, X, IX
add.d T2, IX, INCX add.d T2, IX, INCX
fldx.s $f5, X, T2 fldx.s $f5, X, T2
@@ -225,31 +180,97 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w $vr8, $vr9, 0x10 vextrins.w $vr8, $vr9, 0x10
vextrins.w $vr8, $vr10, 0x20 vextrins.w $vr8, $vr10, 0x20
vextrins.w $vr8, $vr11, 0x30 vextrins.w $vr8, $vr11, 0x30
b .L01_X_1
.L01_X_0:
vldx U4, X, IX
alsl.d T3, INCX, IX, 2
vldx U8, X, T3
.L01_X_1:
.endm


vand.v $vr12, $vr2, $vr2
PROLOGUE


vfmadd.s U2, U1, U4, U2
vfsub.s U2, U2, $vr12
vfmadd.s U2, U14, U8, U2
addi.d $sp, $sp, -88


vextrins.w U4, U2, 0x01
vextrins.w U5, U2, 0x02
vextrins.w U6, U2, 0x03
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80


fadd.s $f2, $f2, $f4
fadd.s $f2, $f2, $f5
fadd.s $f2, $f2, $f6
fadd.s $f2, $f2, $f12
vldrepl.w VALPHA, $sp, 80


vpermi.w U2, U2, 0x00
addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT

bge $r0, M, .L999
bge $r0, N, .L999

sub.d M1, M, N

mul.d JY, M1, INCY
mul.d JX, M1, INCX

move J, M1
move AO1, A

beq J, M, .L999

.L01:
vxor.v U2, U2, U2
fldx.s $f6, X, JX
fmul.s $f3, ALPHA, $f6 //temp1
vpermi.w U3, U3, 0x00

move IY, $r0
move IX, $r0
move II, $r0
move I, $r0

srai.d T0, J, 3
beq I, T0, .L03

mul.w T1, J, LDA
add.d T1, T1, II


slli.d T2, INCX, 3
add.d IX, IX, T2
.L02: /* /8 */
vldx U1, AO1, T1
addi.d T1, T1, 16
vldx U14, AO1, T1
addi.d T1, T1, 16

LOAD_Y_8

vfmadd.s U4, U3, U1, U4
vfmadd.s U8, U3, U14, U8

STORE_Y_8

alsl.d IY, INCY, IY, 3

LOAD_X_8

vfmadd.s U2, U1, U4, U2
vfmadd.s U2, U14, U8, U2

alsl.d IX, INCX, IX, 3


addi.d II, II, 32 addi.d II, II, 32
addi.d I, I, 1 addi.d I, I, 1
blt I, T0, .L02 blt I, T0, .L02


// Acc U2
GACC vf, s, U4, U2
vpermi.w U2, U4, 0x00

.L03: /* &4 */ .L03: /* &4 */
andi T0, J, 4 andi T0, J, 4
beq $r0, T0, .L04 beq $r0, T0, .L04
@@ -414,4 +435,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88 addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0 jirl $r0, $r1, 0x0


EPILOGUE
EPILOGUE

Loading…
Cancel
Save