
LoongArch64: Fixed iamax_lsx.S

Fixed an index retrieval issue when there are
identical maximum absolute values
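
For context, a minimal scalar sketch of the semantics the kernel is expected to match, assuming standard BLAS i?amax behavior: return the 1-based index of the first element whose absolute value is the maximum, so identical maxima resolve to the smallest index. The function name and signature below are illustrative only, not the OpenBLAS kernel interface:

    #include <math.h>
    #include <stddef.h>

    /* Hypothetical scalar reference for iamax semantics: return the 1-based
     * index of the FIRST element with the largest absolute value, so ties
     * between identical maxima resolve to the smallest index.
     * Returns 0 for an empty or zero-stride input. */
    static size_t iamax_ref(size_t n, const double *x, size_t incx)
    {
        if (n == 0 || incx == 0)
            return 0;

        size_t best = 1;              /* BLAS-style 1-based result */
        double maxabs = fabs(x[0]);

        for (size_t i = 1; i < n; i++) {
            double v = fabs(x[i * incx]);
            if (v > maxabs) {         /* strict '>' keeps the earlier index on ties */
                maxabs = v;
                best = i + 1;
            }
        }
        return best;
    }

For an input such as {2.0, -5.0, 5.0} the expected answer is 2, not 3. The updated LSX code below appears to enforce the same smallest-index tie-breaking by using strict less-than comparisons (vfcmp.clt) when merging lanes and, in the reduction tail, by comparing the candidate indices directly when the values are equal.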

Signed-off-by: Hao Chen <chenhao@loongson.cn>
Signed-off-by: gxw <guxiwei-hf@loongson.cn>
tags/v0.3.30
Hao Chen and gxw, 1 year ago
parent commit 7f1ebc7ae6
1 changed file with 134 additions and 100 deletions
kernel/loongarch64/iamax_lsx.S

@@ -56,19 +56,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VI3 $vr8
#define VI4 $vr19
#define VT0 $vr23
#define VZE $vr3
#define VT1 $vr4
#define VT2 $vr5
#define VC0 $vr6

PROLOGUE
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
vldi VZE, 0
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
vld VM0, X, 0
#ifdef DOUBLE
vfsub.d VT1, VZE, VM0
addi.d i0, i0, 1
srai.d I, N, 3
vfmaxa.d VM0, VM0, VT1
bge $r0, I, .L11
slli.d i0, i0, 1 //2
vreplgr2vr.d VINC2, i0
@@ -79,12 +86,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 3
vinsgr2vr.d VI0, i0, 0 //1
vinsgr2vr.d VI0, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
vinsgr2vr.d VI0, i0, 1
#else
vfsub.s VT1, VZE, VM0
addi.w i0, i0, 1
srai.d I, N, 3
vfmaxa.s VM0, VM0, VT1
bge $r0, I, .L21
slli.w i0, i0, 2 //4
vreplgr2vr.w VINC2, i0
@@ -115,39 +124,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vadd.d VI1, VI1, VINC4
vld VX1, X, 2 * SIZE
vadd.d VI2, VI1, VINC2
vfmaxa.d x1, VX0, VX1
vfcmp.ceq.d VT0, VX0, x1
vbitsel.v x2, VI2, VI1, VT0
vfsub.d VT1, VZE, VX0
vfsub.d VT2, VZE, VX1
vfmaxa.d VX0, VX0, VT1
vfmaxa.d VX1, VX1, VT2
vfcmp.clt.d VT0, VX0, VX1 //abs(x0) < abs(x1)
vbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
vbitsel.v x2, VI1, VI2, VT0 //i

vld VX0, X, 4 * SIZE
vadd.d VI1, VI2, VINC2
vld VX1, X, 6 * SIZE
vadd.d VI2, VI1, VINC2
vfmaxa.d x3, VX0, VX1
vfcmp.ceq.d VT0, VX0, x3
vbitsel.v x4, VI2, VI1, VT0
vfmaxa.d x3, x1, x3
vfcmp.ceq.d VT0, x1, x3
vbitsel.v x2, x4, x2, VT0
vfmaxa.d VM1, VM0, x3
vfcmp.ceq.d VT0, VM0, VM1
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, x2, VI0, VT0
vfsub.d VT1, VZE, VX0
vfsub.d VT2, VZE, VX1
vfmaxa.d VX0, VX0, VT1
vfmaxa.d VX1, VX1, VT2
vfcmp.clt.d VT0, VX0, VX1
vbitsel.v x3, VX0, VX1, VT0 //abs(maxf)
vbitsel.v x4, VI1, VI2, VT0 //i
vfcmp.clt.d VC0, x1, x3
vbitsel.v x1, x1, x3, VC0 //abs(maxf)
vbitsel.v x2, x2, x4, VC0 //i
vfcmp.clt.d VT0, VM0, x1
addi.d I, I, -1
addi.d X, X, 8 * SIZE
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0
#else
vld VX0, X, 0 * SIZE
vadd.w VI1, VI1, VINC4
vld VX1, X, 4 * SIZE
vadd.w VI2, VI1, VINC2
vfmaxa.s VM1, VX0, VX1
vfcmp.ceq.s VT0, VX0, VM1
vfsub.s VT1, VZE, VX0
vfsub.s VT2, VZE, VX1
vfmaxa.s VX0, VX0, VT1
vfmaxa.s VX1, VX1, VT2
vfcmp.clt.s VT0, VX0, VX1
vbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
vbitsel.v x2, VI1, VI2, VT0 //i
addi.d I, I, -1
vbitsel.v VI2, VI2, VI1, VT0
vfmaxa.s VM1, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vfcmp.clt.s VT0, VM0, x1
addi.d X, X, 8 * SIZE
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI2, VI0, VT0
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0

#endif
blt $r0, I, .L10
.align 3
@@ -158,7 +179,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
fcmp.ceq.d $fcc0, $f10, $f9
fcmp.ceq.d $fcc0, $f9, $f10
bceqz $fcc0, .L16
vfcmp.clt.d VT0, VI1, VI2
vbitsel.v VI0, VI2, VI1, VT0
@@ -172,28 +193,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmaxa.s VM1, x1, x2
vfcmp.ceq.s VT0, VM1, x1
vbitsel.v VINC2, VI2, VI1, VT0
vfmaxa.s VM0, x3, x4
vfcmp.ceq.s VT0, x3, VM0
vbitsel.v VINC4, VI4, VI3, VT0
vfmaxa.s VM0, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vbitsel.v VI0, VINC4, VINC2, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L26
#endif
.align 3

#ifdef DOUBLE
.L16:
vfmaxa.d VM0, x1, x2
vfcmp.ceq.d VT0, x1, VM0
vbitsel.v VI0, VI2, VI1, VT0
vfcmp.clt.d VT0, x1, x2
vbitsel.v VI0, VI1, VI2, VT0
vbitsel.v VM0, x1, x2, VT0
.align 3

.L17:
@@ -212,10 +220,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.L13:
fld.d $f9, X, 0
vfmaxa.d VM1, x1, VM0
vfcmp.ceq.d VT0, VM0, VM1
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
fsub.d $f10, $f3, $f9
vfmaxa.d x1, x1, x2
vfcmp.clt.d VT0, VM0, x1
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, VI1, VT0
addi.d I, I, -1
addi.d i1, i1, 1
addi.d X, X, SIZE
@@ -241,10 +250,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t2, 1
slli.d i0, i0, 1 //2
vfsub.d VT1, VZE, VM0
vreplgr2vr.d VINC2, i0
slli.d i0, i0, 1 //4
vreplgr2vr.d VINC4, i0
addi.d i0, i0, -7
vfmaxa.d VM0, VM0, VT1
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
@@ -269,9 +280,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add.d X, X, INCX
vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, VINC2
vfmaxa.d x1, VX0, VX1
vfcmp.ceq.d VT0, VX0, x1
vbitsel.v x2, VI2, VI1, VT0

vfsub.d VT1, VZE, VX0
vfsub.d VT2, VZE, VX1
vfmaxa.d VX0, VX0, VT1
vfmaxa.d VX1, VX1, VT2
vfcmp.clt.d VT0, VX0, VX1
vbitsel.v x1, VX0, VX1, VT0
vbitsel.v x2, VI1, VI2, VT0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
@@ -286,16 +302,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add.d X, X, INCX
vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, VINC2
vfmaxa.d x3, VX0, VX1
vfcmp.ceq.d VT0, VX0, x3
vbitsel.v x4, VI2, VI1, VT0
vfmaxa.d x3, x1, x3
vfcmp.ceq.d VT0, x1, x3
vbitsel.v x2, x4, x2, VT0
vfmaxa.d VM1, VM0, x3
vbitsel.v VM0, VM1, VM0, VT0
vfcmp.ceq.d VT0, VM0, VM1
vbitsel.v VI0, x2, VI0, VT0
vfsub.d VT1, VZE, VX0
vfsub.d VT2, VZE, VX1
vfmaxa.d VX0, VX0, VT1
vfmaxa.d VX1, VX1, VT2
vfcmp.clt.d VT0, VX0, VX1
vbitsel.v x3, VX0, VX1, VT0
vbitsel.v x4, VI1, VI2, VT0
vfcmp.clt.d VC0, x1, x3
vbitsel.v x1, x1, x3, VC0
vbitsel.v x2, x2, x4, VC0
vfcmp.clt.d VT0, VM0, x1
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0

addi.d I, I, -1
blt $r0, I, .L24
.align 3
@@ -313,9 +333,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 3

.L26:
vfmaxa.d VM0, x1, x2
vfcmp.ceq.d VT0, x1, VM0
vbitsel.v VI0, VI2, VI1, VT0
vfcmp.clt.d VT0, x1, x2
vbitsel.v VI0, VI1, VI2, VT0
vbitsel.v VM0, x1, x2, VT0
.align 3

.L27:
@@ -389,14 +409,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vadd.w VI2, VI1, VINC2
vfmaxa.s VM1, VX0, VX1
vfcmp.ceq.s VT0, VX0, VM1
vbitsel.v VI2, VI2, VI1, VT0
vfmaxa.s VM1, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vfsub.s VT1, VZE, VX0
vfsub.s VT2, VZE, VX1
vfmaxa.s VX0, VX0, VT1
vfmaxa.s VX1, VX1, VT2
vfcmp.clt.s VT0, VX0, VX1
vbitsel.v x1, VX0, VX1, VT0
vbitsel.v x2, VI1, VI2, VT0 //i

addi.d I, I, -1
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI2, VI0, VT0
vfcmp.clt.s VT0, VM0, x1
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0
blt $r0, I, .L24
.align 3

@@ -409,42 +433,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmaxa.s VM1, x1, x2
vfcmp.ceq.s VT0, VM1, x1
vbitsel.v VINC2, VI2, VI1, VT0
vfmaxa.s VM0, x3, x4
vfcmp.ceq.s VT0, x3, VM0
vbitsel.v VINC4, VI4, VI3, VT0
vfmaxa.s VM0, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vbitsel.v VI0, VINC4, VINC2, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
.align 3

.L26:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L27
vfcmp.clt.s VT0, VI2, VI0
vbitsel.v VI0, VI0, VI2, VT0
fcmp.ceq.s $fcc0, $f9, $f10
bceqz $fcc0, .L31
vfcmp.clt.s VT0, VI1, VI2
vbitsel.v VI1, VI2, VI1, VT0
b .L32
.align 3

.L27:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L28
vfcmp.clt.s VT0, VI3, VI0
vbitsel.v VI0, VI0, VI3, VT0
.L31:
vfcmp.clt.s VT0, x1, x2
vbitsel.v VI1, VI1, VI2, VT0
vbitsel.v x1, x1, x2, VT0
.align 3

.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L29
vfcmp.clt.s VT0, VI4, VI0
vbitsel.v VI0, VI0, VI4, VT0
.L32:
fcmp.ceq.s $fcc0, $f11, $f12
bceqz $fcc0, .L33
vfcmp.clt.s VT1, VI3, VI4
vbitsel.v VI3, VI4, VI3, VT1
b .L34
.align 3
.L33:
vfcmp.clt.s VT1, x3, x4
vbitsel.v x3, x3, x4, VT1
vbitsel.v VI3, VI3, VI4, VT1
.align 3
.L34:
fcmp.ceq.s $fcc0, $f9, $f11
bceqz $fcc0, .L35
vfcmp.clt.s VT0, VI1, VI3
vbitsel.v VI0, VI3, VI1, VT0
vxor.v VM0, x1, VZE
b .L29
.align 3
.L35:
vfcmp.clt.s VT0, x1, x3
vbitsel.v VM0, x1, x3, VT0
vbitsel.v VI0, VI1, VI3, VT0
.align 3

.L29:
movfr2gr.s i0, $f20
.align 3
@@ -462,10 +489,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.L22:
LD $f9, X, 0
VFMAXA VM1, x1, VM0
VCMPEQ VT0, VM0, VM1
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
#ifdef DOUBLE
fsub.d $f10, $f3, $f9
vfmaxa.d x1, x1, x2
vfcmp.clt.d VT0, VM0, x1
#else
fsub.s $f10, $f3, $f9
vfmaxa.s x1, x1, x2
vfcmp.clt.s VT0, VM0, x1
#endif
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, VI1, VT0
addi.d I, I, -1
addi.d i1, i1, 1
add.d X, X, INCX

