Browse Source

Merge pull request #5073 from XiWeiGu/la64_update_symv_lsx_version

LoongArch64: Update symv lsx version
tags/v0.3.30
Martin Kroeker GitHub 1 year ago
parent
commit
eba7338484
No known key found for this signature in database. GPG Key ID: B5690EEEBB952194
4 changed files with 464 additions and 345 deletions
  1. +123
    -85
      kernel/loongarch64/dsymv_L_lsx.S
  2. +118
    -82
      kernel/loongarch64/dsymv_U_lsx.S
  3. +118
    -94
      kernel/loongarch64/ssymv_L_lsx.S
  4. +105
    -84
      kernel/loongarch64/ssymv_U_lsx.S

+ 123
- 85
kernel/loongarch64/dsymv_L_lsx.S View File

@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/* Param */
#define M $r4
@@ -57,6 +58,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28
#define T3 $r29
#define T4 $r30
#define T5 $r17
#define T6 $r16
#define T7 $r12

/* LSX vectors */
#define U0 $vr31
@@ -87,10 +91,114 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8
#define a9 $f9

.macro LOAD_Y_8
beqz T5, .L01_Y_0
add.d T2, IY, INCY
fldx.d $f4, Y, T2
add.d T2, T2, INCY
fldx.d $f5, Y, T2
add.d T2, T2, INCY
fldx.d $f6, Y, T2
add.d T2, T2, INCY
fldx.d $f7, Y, T2

PROLOGUE
add.d T2, T2, INCY
fldx.d $f8, Y, T2
add.d T2, T2, INCY
fldx.d $f9, Y, T2
add.d T2, T2, INCY
fldx.d $f10, Y, T2
add.d T2, T2, INCY
fldx.d $f11, Y, T2

vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
b .L01_Y_1
.L01_Y_0:
add.d T7, IY, INCY
vldx U4, Y, T7
alsl.d T2, INCY, T7, 1
vldx U6, Y, T2
alsl.d T3, INCY, T2, 1
vldx U8, Y, T3
alsl.d T4, INCY, T3, 1
vldx U10, Y, T4
.L01_Y_1:
.endm

.macro LOAD_X_8
beqz T6, .L01_X_0
add.d T2, IX, INCX
fldx.d $f4, X, T2
add.d T2, T2, INCX
fldx.d $f5, X, T2
add.d T2, T2, INCX
fldx.d $f6, X, T2
add.d T2, T2, INCX
fldx.d $f7, X, T2

add.d T2, T2, INCX
fldx.d $f8, X, T2
add.d T2, T2, INCX
fldx.d $f9, X, T2
add.d T2, T2, INCX
fldx.d $f10, X, T2
add.d T2, T2, INCX
fldx.d $f11, X, T2

vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
b .L01_X_1
.L01_X_0:
add.d T7, IX, INCX
vldx U4, X, T7
alsl.d T2, INCX, T7, 1
vldx U6, X, T2
alsl.d T3, INCX, T2, 1
vldx U8, X, T3
alsl.d T4, INCX, T3, 1
vldx U10, X, T4
.L01_X_1:
.endm

.macro STORE_Y_8
beqz T5, .L01_Y_2
vextrins.d U5, U4, 0x01
vextrins.d U7, U6, 0x01
vextrins.d U9, U8, 0x01
vextrins.d U11, U10, 0x01

add.d T2, IY, INCY
fstx.d $f4, Y, T2
add.d T2, T2, INCY
fstx.d $f5, Y, T2
add.d T2, T2, INCY
fstx.d $f6, Y, T2
add.d T2, T2, INCY
fstx.d $f7, Y, T2

add.d T2, T2, INCY
fstx.d $f8, Y, T2
add.d T2, T2, INCY
fstx.d $f9, Y, T2
add.d T2, T2, INCY
fstx.d $f10, Y, T2
add.d T2, T2, INCY
fstx.d $f11, Y, T2
b .L01_Y_3
.L01_Y_2:
vstx U4, Y, T7
vstx U6, Y, T2
vstx U8, Y, T3
vstx U10, Y, T4
.L01_Y_3:
.endm

LDARG BUFFER, $sp, 0
PROLOGUE

addi.d $sp, $sp, -88

@@ -107,6 +215,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

vldrepl.d VALPHA, $sp, 80

addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
@@ -122,11 +232,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
beq J, N, .L999

.L01:
MTC a2, $r0 //temp2
vxor.v U2, U2, U2
fldx.d a6, X, JX
fmul.d a3, ALPHA, a6 //temp1
vshuf4i.d U3, U3, 0x00
vshuf4i.d U2, U2, 0x00

mul.d T0, J, LDA
slli.d T1, J, BASE_SHIFT
@@ -163,105 +272,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vldx U16, AO1, T1
addi.d T1, T1, 16

add.d T2, IY, INCY
fldx.d $f4, Y, T2
add.d T2, T2, INCY
fldx.d $f5, Y, T2
add.d T2, T2, INCY
fldx.d $f6, Y, T2
add.d T2, T2, INCY
fldx.d $f7, Y, T2

add.d T2, T2, INCY
fldx.d $f8, Y, T2
add.d T2, T2, INCY
fldx.d $f9, Y, T2
add.d T2, T2, INCY
fldx.d $f10, Y, T2
add.d T2, T2, INCY
fldx.d $f11, Y, T2

vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
LOAD_Y_8

vfmadd.d U4, U3, U1, U4
vfmadd.d U6, U3, U14, U6
vfmadd.d U8, U3, U15, U8
vfmadd.d U10, U3, U16, U10

vextrins.d U5, U4, 0x01
vextrins.d U7, U6, 0x01
vextrins.d U9, U8, 0x01
vextrins.d U11, U10, 0x01

add.d T2, IY, INCY
fstx.d $f4, Y, T2
add.d T2, T2, INCY
fstx.d $f5, Y, T2
add.d T2, T2, INCY
fstx.d $f6, Y, T2
add.d T2, T2, INCY
fstx.d $f7, Y, T2

add.d T2, T2, INCY
fstx.d $f8, Y, T2
add.d T2, T2, INCY
fstx.d $f9, Y, T2
add.d T2, T2, INCY
fstx.d $f10, Y, T2
add.d T2, T2, INCY
fstx.d $f11, Y, T2

slli.d T2, INCY, 3
add.d IY, IY, T2

add.d T2, IX, INCX
fldx.d $f4, X, T2
add.d T2, T2, INCX
fldx.d $f5, X, T2
add.d T2, T2, INCX
fldx.d $f6, X, T2
add.d T2, T2, INCX
fldx.d $f7, X, T2

add.d T2, T2, INCX
fldx.d $f8, X, T2
add.d T2, T2, INCX
fldx.d $f9, X, T2
add.d T2, T2, INCX
fldx.d $f10, X, T2
add.d T2, T2, INCX
fldx.d $f11, X, T2
STORE_Y_8

vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
alsl.d IY, INCY, IY, 3

vand.v $vr12, $vr2, $vr2
LOAD_X_8

vfmadd.d U2, U1, U4, U2
vfsub.d U2, U2, $vr12
vfmadd.d U2, U14, U6, U2
vfmadd.d U2, U15, U8, U2
vfmadd.d U2, U16, U10, U2

vextrins.d U4, U2, 0x01

fadd.d $f2, $f2, $f4
fadd.d $f2, $f2, $f12

vextrins.d U2, U2, 0x10

slli.d T2, INCX, 3
add.d IX, IX, T2
alsl.d IX, INCX, IX, 3

addi.d II, II, 64
addi.d I, I, 1
blt I, T0, .L02

// Acc U2
GACC vf, d, U4, U2
vilvl.d U2, U4, U4

.L03: /* &4 */
sub.d T0, M, J
addi.d T0, T0, -1
@@ -429,4 +467,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0

EPILOGUE
EPILOGUE

+ 118
- 82
kernel/loongarch64/dsymv_U_lsx.S View File

@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/* Param */
#define M $r4
@@ -57,6 +58,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28
#define T3 $r29
#define T4 $r30
#define T5 $r17
#define T6 $r16
#define T7 $r12

/* LSX vectors */
#define U0 $vr31
@@ -87,10 +91,109 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8
#define a9 $f9

.macro LOAD_Y_8
beqz T5, .L01_Y_0
fldx.d $f4, Y, IY
add.d T2, IY, INCY
fldx.d $f5, Y, T2
add.d T2, T2, INCY
fldx.d $f6, Y, T2
add.d T2, T2, INCY
fldx.d $f7, Y, T2

PROLOGUE
add.d T2, T2, INCY
fldx.d $f8, Y, T2
add.d T2, T2, INCY
fldx.d $f9, Y, T2
add.d T2, T2, INCY
fldx.d $f10, Y, T2
add.d T2, T2, INCY
fldx.d $f11, Y, T2

LDARG BUFFER, $sp, 0
vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
b .L01_Y_1
.L01_Y_0:
vldx U4, Y, IY
alsl.d T2, INCY, IY, 1
vldx U6, Y, T2
alsl.d T3, INCY, T2, 1
vldx U8, Y, T3
alsl.d T4, INCY, T3, 1
vldx U10, Y, T4
.L01_Y_1:
.endm

.macro STORE_Y_8
beqz T5, .L01_Y_2
vextrins.d U5, U4, 0x01
vextrins.d U7, U6, 0x01
vextrins.d U9, U8, 0x01
vextrins.d U11, U10, 0x01

fstx.d $f4, Y, IY
add.d T2, IY, INCY
fstx.d $f5, Y, T2
add.d T2, T2, INCY
fstx.d $f6, Y, T2
add.d T2, T2, INCY
fstx.d $f7, Y, T2

add.d T2, T2, INCY
fstx.d $f8, Y, T2
add.d T2, T2, INCY
fstx.d $f9, Y, T2
add.d T2, T2, INCY
fstx.d $f10, Y, T2
add.d T2, T2, INCY
fstx.d $f11, Y, T2
b .L01_Y_3
.L01_Y_2:
vstx U4, Y, IY
vstx U6, Y, T2
vstx U8, Y, T3
vstx U10,Y, T4
.L01_Y_3:
.endm

.macro LOAD_X_8
beqz T6, .L01_X_0
fldx.d $f4, X, IX
add.d T2, IX, INCX
fldx.d $f5, X, T2
add.d T2, T2, INCX
fldx.d $f6, X, T2
add.d T2, T2, INCX
fldx.d $f7, X, T2

add.d T2, T2, INCX
fldx.d $f8, X, T2
add.d T2, T2, INCX
fldx.d $f9, X, T2
add.d T2, T2, INCX
fldx.d $f10, X, T2
add.d T2, T2, INCX
fldx.d $f11, X, T2

vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
b .L01_X_1
.L01_X_0:
vldx U4, X, IX
alsl.d T2, INCX, IX, 1
vldx U6, X, T2
alsl.d T3, INCX, T2, 1
vldx U8, X, T3
alsl.d T4, INCX, T3, 1
vldx U10, X, T4
.L01_X_1:
.endm

PROLOGUE

addi.d $sp, $sp, -88

@@ -107,6 +210,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

vldrepl.d VALPHA, $sp, 80

addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
@@ -125,11 +230,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
beq J, M, .L999

.L01:
MTC $f2, $r0 //temp2
vxor.v U2, U2, U2
fldx.d $f6, X, JX
fmul.d $f3, ALPHA, $f6 //temp1
vshuf4i.d U3, U3, 0x00
vshuf4i.d U2, U2, 0x00

move IY, $r0
move IX, $r0
@@ -152,102 +256,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vldx U16, AO1, T1
addi.d T1, T1, 16

fldx.d $f4, Y, IY
add.d T2, IY, INCY
fldx.d $f5, Y, T2
add.d T2, T2, INCY
fldx.d $f6, Y, T2
add.d T2, T2, INCY
fldx.d $f7, Y, T2

add.d T2, T2, INCY
fldx.d $f8, Y, T2
add.d T2, T2, INCY
fldx.d $f9, Y, T2
add.d T2, T2, INCY
fldx.d $f10, Y, T2
add.d T2, T2, INCY
fldx.d $f11, Y, T2

vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10
LOAD_Y_8

vfmadd.d U4, U3, U1, U4
vfmadd.d U6, U3, U14, U6
vfmadd.d U8, U3, U15, U8
vfmadd.d U10, U3, U16, U10

vextrins.d U5, U4, 0x01
vextrins.d U7, U6, 0x01
vextrins.d U9, U8, 0x01
vextrins.d U11, U10, 0x01
STORE_Y_8

fstx.d $f4, Y, IY
add.d T2, IY, INCY
fstx.d $f5, Y, T2
add.d T2, T2, INCY
fstx.d $f6, Y, T2
add.d T2, T2, INCY
fstx.d $f7, Y, T2
alsl.d IY, INCY, IY, 3

add.d T2, T2, INCY
fstx.d $f8, Y, T2
add.d T2, T2, INCY
fstx.d $f9, Y, T2
add.d T2, T2, INCY
fstx.d $f10, Y, T2
add.d T2, T2, INCY
fstx.d $f11, Y, T2

slli.d T2, INCY, 3
add.d IY, IY, T2

fldx.d $f4, X, IX
add.d T2, IX, INCX
fldx.d $f5, X, T2
add.d T2, T2, INCX
fldx.d $f6, X, T2
add.d T2, T2, INCX
fldx.d $f7, X, T2

add.d T2, T2, INCX
fldx.d $f8, X, T2
add.d T2, T2, INCX
fldx.d $f9, X, T2
add.d T2, T2, INCX
fldx.d $f10, X, T2
add.d T2, T2, INCX
fldx.d $f11, X, T2

vextrins.d U4, U5, 0x10
vextrins.d U6, U7, 0x10
vextrins.d U8, U9, 0x10
vextrins.d U10, U11, 0x10

vand.v $vr12, $vr2, $vr2
LOAD_X_8

vfmadd.d U2, U1, U4, U2
vfsub.d U2, U2, $vr12
vfmadd.d U2, U14, U6, U2
vfmadd.d U2, U15, U8, U2
vfmadd.d U2, U16, U10, U2

vextrins.d U4, U2, 0x01

fadd.d $f2, $f2, $f4
fadd.d $f2, $f2, $f12

vextrins.d U2, U2, 0x10

slli.d T2, INCX, 3
add.d IX, IX, T2
alsl.d IX, INCX, IX, 3

addi.d II, II, 64
addi.d I, I, 1
blt I, T0, .L02

// Acc U2
GACC vf, d, U4, U2
vilvl.d U2, U4, U4

.L03: /* &4 */
andi T0, J, 4
beq $r0, T0, .L04
@@ -417,4 +453,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0

EPILOGUE
EPILOGUE

+ 118
- 94
kernel/loongarch64/ssymv_L_lsx.S View File

@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/* Param */
#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28
#define T3 $r29
#define T4 $r30
#define T5 $r17
#define T6 $r16

/* LSX vectors */
#define U0 $vr31
@@ -88,77 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a9 $f9


PROLOGUE

LDARG BUFFER, $sp, 0

addi.d $sp, $sp, -88

SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80

vldrepl.w VALPHA, $sp, 80

slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT

bge $r0, M, .L999
bge $r0, N, .L999

move J, $r0
move JY, $r0
move JX, $r0
move AO1, A

beq J, N, .L999

.L01:
MTC a2, $r0 //temp2
fldx.s a6, X, JX
fmul.s a3, ALPHA, a6 //temp1
vpermi.w U3, U3, 0x00
vpermi.w U2, U2, 0x00

mul.w T0, J, LDA
slli.d T1, J, BASE_SHIFT
add.w T0, T0, T1
fldx.s a6, AO1, T0
fldx.s a4, Y, JY
fmadd.s a4, a3, a6, a4
fstx.s a4, Y, JY

move IY, JY
move IX, JX
addi.d II, J, 1
move I, II
slli.d II, II, BASE_SHIFT

sub.d T0, M, J
addi.d T0, T0, -1
srai.d T0, T0, 3
add.d T0, T0, J
addi.d T0, T0, 1
beq I, T0, .L03
bge I, T0, .L03

mul.w T1, J, LDA
add.d T1, T1, II

.L02: /* /8 */
vldx U1, AO1, T1
addi.d T1, T1, 16
vldx U14, AO1, T1
addi.d T1, T1, 16

.macro LOAD_Y_8
beqz T5, .L01_Y_0
add.d T2, IY, INCY
fldx.s $f4, Y, T2
add.d T2, T2, INCY
@@ -183,10 +117,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w U8, U9, 0x10
vextrins.w U8, U10, 0x20
vextrins.w U8, U11, 0x30

vfmadd.s U4, U3, U1, U4
vfmadd.s U8, U3, U14, U8

b .L01_Y_1
.L01_Y_0:
add.d T3, IY, INCY
vldx U4, Y, T3
alsl.d T4, INCY, T3, 2
vldx U8, Y, T4
.L01_Y_1:
.endm

.macro STORE_Y_8
beqz T5, .L01_Y_2
vextrins.w U5, U4, 0x01
vextrins.w U6, U4, 0x02
vextrins.w U7, U4, 0x03
@@ -211,10 +152,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fstx.s $f10, Y, T2
add.d T2, T2, INCY
fstx.s $f11, Y, T2

slli.d T2, INCY, 3
add.d IY, IY, T2

b .L01_Y_3
.L01_Y_2:
vstx U4, Y, T3
vstx U8, Y, T4
.L01_Y_3:
.endm

.macro LOAD_X_8
beqz T6, .L01_X_0
add.d T2, IX, INCX
fldx.s $f4, X, T2
add.d T2, T2, INCX
@@ -239,31 +185,109 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w $vr8, $vr9, 0x10
vextrins.w $vr8, $vr10, 0x20
vextrins.w $vr8, $vr11, 0x30
b .L01_X_1
.L01_X_0:
add.d T3, IX, INCX
vldx U4, X, T3
alsl.d T4, INCX, T3, 2
vldx U8, X, T4
.L01_X_1:
.endm

vand.v $vr12, $vr2, $vr2
PROLOGUE

vfmadd.s U2, U1, U4, U2
vfsub.s U2, U2, $vr12
vfmadd.s U2, U14, U8, U2
addi.d $sp, $sp, -88

vextrins.w U4, U2, 0x01
vextrins.w U5, U2, 0x02
vextrins.w U6, U2, 0x03
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80

fadd.s $f2, $f2, $f4
fadd.s $f2, $f2, $f5
fadd.s $f2, $f2, $f6
fadd.s $f2, $f2, $f12
vldrepl.w VALPHA, $sp, 80

vpermi.w U2, U2, 0x00
addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT

bge $r0, M, .L999
bge $r0, N, .L999

move J, $r0
move JY, $r0
move JX, $r0
move AO1, A

beq J, N, .L999

.L01:
vxor.v U2, U2, U2
fldx.s a6, X, JX
fmul.s a3, ALPHA, a6 //temp1
vpermi.w U3, U3, 0x00

mul.w T0, J, LDA
slli.d T1, J, BASE_SHIFT
add.w T0, T0, T1
fldx.s a6, AO1, T0
fldx.s a4, Y, JY
fmadd.s a4, a3, a6, a4
fstx.s a4, Y, JY

move IY, JY
move IX, JX
addi.d II, J, 1
move I, II
slli.d II, II, BASE_SHIFT

slli.d T2, INCX, 3
add.d IX, IX, T2
sub.d T0, M, J
addi.d T0, T0, -1
srai.d T0, T0, 3
add.d T0, T0, J
addi.d T0, T0, 1
beq I, T0, .L03
bge I, T0, .L03

mul.w T1, J, LDA
add.d T1, T1, II

.L02: /* /8 */
vldx U1, AO1, T1
addi.d T1, T1, 16
vldx U14, AO1, T1
addi.d T1, T1, 16

LOAD_Y_8

vfmadd.s U4, U3, U1, U4
vfmadd.s U8, U3, U14, U8

STORE_Y_8

alsl.d IY, INCY, IY, 3

LOAD_X_8

vfmadd.s U2, U1, U4, U2
vfmadd.s U2, U14, U8, U2

alsl.d IX, INCX, IX, 3

addi.d II, II, 32
addi.d I, I, 1
blt I, T0, .L02

// Acc U2
GACC vf, s, U4, U2
vpermi.w U2, U4, 0

.L03: /* &4 */
sub.d T0, M, J
addi.d T0, T0, -1
@@ -426,4 +450,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0

EPILOGUE
EPILOGUE

+ 105
- 84
kernel/loongarch64/ssymv_U_lsx.S View File

@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/* Param */
#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28
#define T3 $r29
#define T4 $r30
#define T5 $r17
#define T6 $r16

/* LSX vectors */
#define U0 $vr31
@@ -87,67 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8
#define a9 $f9


PROLOGUE

LDARG BUFFER, $sp, 0

addi.d $sp, $sp, -88

SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80

vldrepl.w VALPHA, $sp, 80

slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT

bge $r0, M, .L999
bge $r0, N, .L999

sub.d M1, M, N

mul.d JY, M1, INCY
mul.d JX, M1, INCX

move J, M1
move AO1, A

beq J, M, .L999

.L01:
MTC $f2, $r0 //temp2
fldx.s $f6, X, JX
fmul.s $f3, ALPHA, $f6 //temp1
vpermi.w U3, U3, 0x00
vpermi.w U2, U2, 0x00

move IY, $r0
move IX, $r0
move II, $r0
move I, $r0

srai.d T0, J, 3
beq I, T0, .L03

mul.w T1, J, LDA
add.d T1, T1, II

.L02: /* /8 */
vldx U1, AO1, T1
addi.d T1, T1, 16
vldx U14, AO1, T1
addi.d T1, T1, 16

.macro LOAD_Y_8
beqz T5, .L01_Y_0
fldx.s $f4, Y, IY
add.d T2, IY, INCY
fldx.s $f5, Y, T2
@@ -171,10 +115,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w U8, U9, 0x10
vextrins.w U8, U10, 0x20
vextrins.w U8, U11, 0x30

vfmadd.s U4, U3, U1, U4
vfmadd.s U8, U3, U14, U8

b .L01_Y_1
.L01_Y_0:
vldx U4, Y, IY
alsl.d T2, INCY, IY, 2
vldx U8, Y, T2
.L01_Y_1:
.endm

.macro STORE_Y_8
beqz T5, .L01_Y_2
vextrins.w U5, U4, 0x01
vextrins.w U6, U4, 0x02
vextrins.w U7, U4, 0x03
@@ -198,10 +148,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fstx.s $f10, Y, T2
add.d T2, T2, INCY
fstx.s $f11, Y, T2

slli.d T2, INCY, 3
add.d IY, IY, T2

b .L01_Y_3
.L01_Y_2:
vstx U4, Y, IY
vstx U8, Y, T2
.L01_Y_3:
.endm

.macro LOAD_X_8
beqz T6, .L01_X_0
fldx.s $f4, X, IX
add.d T2, IX, INCX
fldx.s $f5, X, T2
@@ -225,31 +180,97 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w $vr8, $vr9, 0x10
vextrins.w $vr8, $vr10, 0x20
vextrins.w $vr8, $vr11, 0x30
b .L01_X_1
.L01_X_0:
vldx U4, X, IX
alsl.d T3, INCX, IX, 2
vldx U8, X, T3
.L01_X_1:
.endm

vand.v $vr12, $vr2, $vr2
PROLOGUE

vfmadd.s U2, U1, U4, U2
vfsub.s U2, U2, $vr12
vfmadd.s U2, U14, U8, U2
addi.d $sp, $sp, -88

vextrins.w U4, U2, 0x01
vextrins.w U5, U2, 0x02
vextrins.w U6, U2, 0x03
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80

fadd.s $f2, $f2, $f4
fadd.s $f2, $f2, $f5
fadd.s $f2, $f2, $f6
fadd.s $f2, $f2, $f12
vldrepl.w VALPHA, $sp, 80

vpermi.w U2, U2, 0x00
addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT

bge $r0, M, .L999
bge $r0, N, .L999

sub.d M1, M, N

mul.d JY, M1, INCY
mul.d JX, M1, INCX

move J, M1
move AO1, A

beq J, M, .L999

.L01:
vxor.v U2, U2, U2
fldx.s $f6, X, JX
fmul.s $f3, ALPHA, $f6 //temp1
vpermi.w U3, U3, 0x00

move IY, $r0
move IX, $r0
move II, $r0
move I, $r0

srai.d T0, J, 3
beq I, T0, .L03

mul.w T1, J, LDA
add.d T1, T1, II

slli.d T2, INCX, 3
add.d IX, IX, T2
.L02: /* /8 */
vldx U1, AO1, T1
addi.d T1, T1, 16
vldx U14, AO1, T1
addi.d T1, T1, 16

LOAD_Y_8

vfmadd.s U4, U3, U1, U4
vfmadd.s U8, U3, U14, U8

STORE_Y_8

alsl.d IY, INCY, IY, 3

LOAD_X_8

vfmadd.s U2, U1, U4, U2
vfmadd.s U2, U14, U8, U2

alsl.d IX, INCX, IX, 3

addi.d II, II, 32
addi.d I, I, 1
blt I, T0, .L02

// Acc U2
GACC vf, s, U4, U2
vpermi.w U2, U4, 0x00

.L03: /* &4 */
andi T0, J, 4
beq $r0, T0, .L04
@@ -414,4 +435,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0

EPILOGUE
EPILOGUE

Loading…
Cancel
Save