
Loongarch64: fixed iamax_lasx

tags/v0.3.30
pengxu 9 months ago
commit b528b1b8ea
1 changed file with 282 additions and 284 deletions

kernel/loongarch64/iamax_lasx.S (+282, -284)
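For orientation (not part of the patch itself): iamax_lasx.S implements the BLAS i?amax kernel, which returns the 1-based index of the first element with the largest absolute value, or 0 when n or incx is not positive, as the early bge-to-.L999 exits in the diff below show. A minimal scalar sketch of that contract in C follows; the name iamax_ref is purely illustrative and not an OpenBLAS symbol.

#include <math.h>

/* Hypothetical scalar reference (iamax_ref is illustrative only):
 * 1-based index of the first element with the largest absolute value,
 * or 0 when n <= 0 or incx <= 0, per the BLAS i?amax contract. */
static long iamax_ref(long n, const double *x, long incx)
{
    if (n <= 0 || incx <= 0)
        return 0;
    long best = 1;                    /* BLAS indices are 1-based */
    double maxf = fabs(x[0]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[i * incx]);
        if (v > maxf) {               /* strict '>' keeps the lowest index on ties */
            maxf = v;
            best = i + 1;
        }
    }
    return best;
}

As far as the diff shows, the fix vectorizes exactly this comparison on absolute values: the new VZE register holds a zero vector, VT1/VT2 hold the per-lane negations, xvfmaxa then yields the magnitudes that xvfcmp.clt compares, and the tail reduction resolves ties toward the lower index.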

@@ -56,25 +56,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VI3 $xr8
#define VI4 $xr19
#define VT0 $xr23
#define VZE $xr3
#define VT1 $xr4
#define VT2 $xr5
#define VC0 $xr6


PROLOGUE
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
xvldi VZE, 0
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
xvld VM0, X, 0
#ifdef DOUBLE
xvfsub.d VT1, VZE, VM0
addi.d i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.d i0, i0, 2 //4
xvfmaxa.d VM0, VM0, VT1
bge $r0, I, .L11
slli.d i0, i0, 1 //2
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
slli.d i0, i0, 1 //4
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
addi.d i0, i0, -7
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
@@ -82,19 +89,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
xvinsgr2vr.d VI0, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
xvinsgr2vr.d VI0, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
xvinsgr2vr.d VI0, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3
#else
xvfsub.s VT1, VZE, VM0
addi.w i0, i0, 1
srai.d I, N, 3
xvfmaxa.s VM0, VM0, VT1
bge $r0, I, .L21
slli.w i0, i0, 3 //8
slli.w i0, i0, 2 //4
xvreplgr2vr.w VINC4, i0
slli.w i0, i0, 1 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
@@ -135,73 +146,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef DOUBLE
xvld VX0, X, 0 * SIZE
xvadd.d VI1, VI1, VINC8
xvld VX1, X, 4 * SIZE
xvld VX1, X, 2 * SIZE
xvadd.d VI2, VI1, VINC4
xvfsub.d VT1, VZE, VX0
xvfsub.d VT2, VZE, VX1
xvfmaxa.d VX0, VX0, VT1
xvfmaxa.d VX1, VX1, VT2
xvfcmp.clt.d VT0, VX0, VX1 //abs(x0) < abs(x1)
xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
xvbitsel.v x2, VI1, VI2, VT0 //i

xvld VX0, X, 4 * SIZE
xvadd.d VI1, VI2, VINC4
xvld VX1, X, 6 * SIZE
xvadd.d VI2, VI1, VINC4
xvfmaxa.d VM1, VX0, VX1
xvfcmp.ceq.d VT0, VX0, VM1
xvfsub.d VT1, VZE, VX0
xvfsub.d VT2, VZE, VX1
xvfmaxa.d VX0, VX0, VT1
xvfmaxa.d VX1, VX1, VT2
xvfcmp.clt.d VT0, VX0, VX1
xvbitsel.v x3, VX0, VX1, VT0 //abs(maxf)
xvbitsel.v x4, VI1, VI2, VT0 //i
xvfcmp.clt.d VC0, x1, x3
xvbitsel.v x1, x1, x3, VC0 //abs(maxf)
xvbitsel.v x2, x2, x4, VC0 //i
xvfcmp.clt.d VT0, VM0, x1
addi.d I, I, -1
xvbitsel.v VI2, VI2, VI1, VT0
xvfmaxa.d VM1, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI2, VI0, VT0
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, x2, VT0
#else
xvld VX0, X, 0 * SIZE
addi.d I, I, -1
xvadd.w VI1, VI1, VINC8
xvfmaxa.s VM1, VX0, VM0
xvfcmp.ceq.s VT0, VM0, VM1
xvld VX1, X, 4 * SIZE
xvadd.w VI2, VI1, VINC4
xvfsub.s VT1, VZE, VX0
xvfsub.s VT2, VZE, VX1
xvfmaxa.s VX0, VX0, VT1
xvfmaxa.s VX1, VX1, VT2
xvfcmp.clt.s VT0, VX0, VX1
xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
xvbitsel.v x2, VI1, VI2, VT0 //i
addi.d I, I, -1
xvfcmp.clt.s VT0, VM0, x1
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, x2, VT0

#endif
blt $r0, I, .L10
.align 3


.L15:
#ifdef DOUBLE
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
vreplvei.d $vr21, $vr20, 0
vreplvei.d $vr22, $vr20, 1
vreplvei.d $vr9, $vr15, 0
vreplvei.d $vr10, $vr15, 1
fcmp.ceq.d $fcc0, $f9, $f10
bceqz $fcc0, .L16
xvfcmp.clt.d VT0, VI1, VI2
xvbitsel.v VI0, VI2, VI1, VT0
b .L17
#else
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
vreplvei.w $vr21, $vr20, 0
vreplvei.w $vr22, $vr20, 1
vreplvei.w $vr8, $vr20, 2
vreplvei.w $vr19, $vr20, 3
vreplvei.w $vr9, $vr15, 0
vreplvei.w $vr10, $vr15, 1
vreplvei.w $vr11, $vr15, 2
vreplvei.w $vr12, $vr15, 3
b .L26
#endif
XVFMAXA VM1, x1, x2
XVCMPEQ VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
XVFMAXA VM0, x3, x4
XVCMPEQ VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
XVFMAXA VM0, VM0, VM1
XVCMPEQ VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
CMPEQ $fcc0, $f15, $f9
bceqz $fcc0, .L26
XVCMPLT VT0, VI1, VI0
.align 3

#ifdef DOUBLE
.L16:
xvfcmp.clt.d VT0, x1, x2
xvbitsel.v VI0, VI1, VI2, VT0
xvbitsel.v VM0, x1, x2, VT0
.align 3

.L17:
movfr2gr.d i0, $f20
.align 3

.L11: //INCX==1 and N<8
andi I, N, 7
bge $r0, I, .L14
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3

.L13:
fld.d $f9, X, 0
fsub.d $f10, $f3, $f9
xvfmaxa.d x1, x1, x2
xvfcmp.clt.d VT0, VM0, x1
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, VI1, VT0
b .L26
addi.d I, I, -1
addi.d i1, i1, 1
addi.d X, X, SIZE
movgr2fr.d $f21, i1
blt $r0, I, .L13
movfr2gr.d i0, $f20
.align 3

.L14:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3


.L20: // INCX!=1
move TEMP, X
#ifdef DOUBLE
addi.d i0, i0, 1
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
@@ -210,34 +272,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bge $r0, I, .L21
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t2, 1
xvinsgr2vr.d VM0, t3, 2
xvinsgr2vr.d VM0, t4, 3
slli.d i0, i0, 2 //4
slli.d i0, i0, 1 //2
xvfsub.d VT1, VZE, VM0
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
slli.d i0, i0, 1 //4
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
addi.d i0, i0, -7
xvfmaxa.d VM0, VM0, VT1
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
addi.d i0, i0, 3
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
.align 3

.L24:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t2, 1
xvadd.d VI1, VI1, VINC8
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t2, 1
xvadd.d VI2, VI1, VINC4

xvfsub.d VT1, VZE, VX0
xvfsub.d VT2, VZE, VX1
xvfmaxa.d VX0, VX0, VT1
xvfmaxa.d VX1, VX1, VT2
xvfcmp.clt.d VT0, VX0, VX1
xvbitsel.v x1, VX0, VX1, VT0
xvbitsel.v x2, VI1, VI2, VT0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t2, 1
xvadd.d VI1, VI2, VINC4
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t2, 1
xvadd.d VI2, VI1, VINC4
xvfsub.d VT1, VZE, VX0
xvfsub.d VT2, VZE, VX1
xvfmaxa.d VX0, VX0, VT1
xvfmaxa.d VX1, VX1, VT2
xvfcmp.clt.d VT0, VX0, VX1
xvbitsel.v x3, VX0, VX1, VT0
xvbitsel.v x4, VI1, VI2, VT0
xvfcmp.clt.d VC0, x1, x3
xvbitsel.v x1, x1, x3, VC0
xvbitsel.v x2, x2, x4, VC0
xvfcmp.clt.d VT0, VM0, x1
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, x2, VT0

addi.d I, I, -1
blt $r0, I, .L24
.align 3

.L25:
vreplvei.d $vr21, $vr20, 0
vreplvei.d $vr22, $vr20, 1
vreplvei.d $vr9, $vr15, 0
vreplvei.d $vr10, $vr15, 1
fcmp.ceq.d $fcc0, $f10, $f9
bceqz $fcc0, .L26
xvfcmp.clt.d VT0, VI1, VI2
xvbitsel.v VI0, VI2, VI1, VT0
b .L27
.align 3

.L26:
xvfcmp.clt.d VT0, x1, x2
xvbitsel.v VI0, VI1, VI2, VT0
xvbitsel.v VM0, x1, x2, VT0
.align 3

.L27:
movfr2gr.d i0, $f20
.align 3

#else
.L20: // INCX!=1
move TEMP, X
addi.w i0, i0, 1
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
@@ -253,19 +384,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w VM0, t2, 1
xvinsgr2vr.w VM0, t3, 2
xvinsgr2vr.w VM0, t4, 3
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 4
xvinsgr2vr.w VM0, t2, 5
xvinsgr2vr.w VM0, t3, 6
xvinsgr2vr.w VM0, t4, 7
slli.w i0, i0, 3 //8
slli.w i0, i0, 2 //4
xvreplgr2vr.w VINC4, i0
slli.w i0, i0, 1 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
@@ -275,15 +396,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, 1
addi.w i0, i0, 5
xvinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
@@ -291,54 +404,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4 //5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 6 //7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
#endif
.align 3


.L24:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
xvadd.d VI1, VI1, VINC8
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvadd.d VI2, VI1, VINC4
xvfmaxa.d VM1, VX0, VX1
xvfcmp.ceq.d VT0, VX0, VM1
addi.d I, I, -1
xvbitsel.v VI2, VI2, VI1, VT0
xvfmaxa.d VM1, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI2, VI0, VT0
#else
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
@@ -351,6 +419,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
xvadd.w VI1, VI1, VINC8
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
@@ -359,158 +428,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvadd.w VI1, VI1, VINC8
xvfmaxa.s VM1, VX0, VM0
xvfcmp.ceq.s VT0, VM1, VM0
xvinsgr2vr.w VX1, t1, 0
xvinsgr2vr.w VX1, t2, 1
xvinsgr2vr.w VX1, t3, 2
xvinsgr2vr.w VX1, t4, 3
xvadd.w VI2, VI1, VINC4
xvfsub.s VT1, VZE, VX0
xvfsub.s VT2, VZE, VX1
xvfmaxa.s VX0, VX0, VT1
xvfmaxa.s VX1, VX1, VT2
xvfcmp.clt.s VT0, VX0, VX1
xvbitsel.v x1, VX0, VX1, VT0
xvbitsel.v x2, VI1, VI2, VT0 //i

addi.d I, I, -1
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
#endif
xvfcmp.clt.s VT0, VM0, x1
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, x2, VT0
blt $r0, I, .L24
.align 3


.L25:
#ifdef DOUBLE
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmaxa.d VM1, x1, x2
xvfcmp.ceq.d VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.d VM0, x4, x3
xvfcmp.ceq.d VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmaxa.d VM0, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
#else
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfmaxa.s VM1, x1, x2
xvfcmp.ceq.s VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfmaxa.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
#endif
CMPEQ $fcc0, $f15, $f9
bceqz $fcc0, .L26
XVCMPLT VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
vreplvei.w $vr21, $vr20, 0
vreplvei.w $vr22, $vr20, 1
vreplvei.w $vr8, $vr20, 2
vreplvei.w $vr19, $vr20, 3
vreplvei.w $vr9, $vr15, 0
vreplvei.w $vr10, $vr15, 1
vreplvei.w $vr11, $vr15, 2
vreplvei.w $vr12, $vr15, 3
.align 3


.L26:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L27
XVCMPLT VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
fcmp.ceq.s $fcc0, $f9, $f10
bceqz $fcc0, .L31
xvfcmp.clt.s VT0, VI1, VI2
xvbitsel.v VI1, VI2, VI1, VT0
b .L32
.align 3

.L27:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L28
XVCMPLT VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.L31:
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v VI1, VI1, VI2, VT0
xvbitsel.v x1, x1, x2, VT0
.align 3
.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L29
XVCMPLT VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.L32:
fcmp.ceq.s $fcc0, $f11, $f12
bceqz $fcc0, .L33
xvfcmp.clt.s VT1, VI3, VI4
xvbitsel.v VI3, VI4, VI3, VT1
b .L34
.align 3

.L29:
#ifdef DOUBLE
movfr2gr.d i0, $f20
#else
fmov.s $f16, $f20
#endif
.L33:
xvfcmp.clt.s VT1, x3, x4
xvbitsel.v x3, x3, x4, VT1
xvbitsel.v VI3, VI3, VI4, VT1
.align 3

#ifdef DOUBLE

#else
.L252:
xvxor.v VI0, VI0, VI0
xvor.v VI0, VI0, VX0
fmov.s $f13, $f15
xvxor.v VM0, VM0, VM0
xvor.v VM0, VM0, VX1
xvpickve.w VI1, VI0, 4
xvpickve.w VI2, VI0, 5
xvpickve.w VI3, VI0, 6
xvpickve.w VI4, VI0, 7
xvpickve.w x1, VM0, 4
xvpickve.w x2, VM0, 5
xvpickve.w x3, VM0, 6
xvpickve.w x4, VM0, 7
xvfmaxa.s VM1, x1, x2
xvfcmp.ceq.s VT0, x1, VM1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmaxa.s VM0, x3, x4
xvfcmp.ceq.s VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmaxa.s VM0, VM0, VM1
xvfcmp.ceq.s VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L262
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.L34:
fcmp.ceq.s $fcc0, $f9, $f11
bceqz $fcc0, .L35
xvfcmp.clt.s VT0, VI1, VI3
xvbitsel.v VI0, VI3, VI1, VT0
xvxor.v VM0, x1, VZE
b .L29
.align 3

.L262:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L272
xvfcmp.clt.s VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.L35:
xvfcmp.clt.s VT0, x1, x3
xvbitsel.v VM0, x1, x3, VT0
xvbitsel.v VI0, VI1, VI3, VT0
.align 3
.L272:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L282
xvfcmp.clt.s VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3

.L282:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L292
xvfcmp.clt.s VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.L29:
movfr2gr.s i0, $f20
.align 3


.L292:
xvfmaxa.s VM0, VX0, VM0
xvfcmp.ceq.s VT0, VM0, VX0
xvbitsel.v VI0, VI0, VI1, VT0
movfr2gr.s i0, $f20
#endif

.L21: //N<8
.L21: // N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
@@ -521,17 +512,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 3


.L22:
LD $f9, X, 0
#ifdef DOUBLE
fsub.d $f10, $f3, $f9
xvfmaxa.d x1, x1, x2
xvfcmp.clt.d VT0, VM0, x1
#else
fsub.s $f10, $f3, $f9
xvfmaxa.s x1, x1, x2
xvfcmp.clt.s VT0, VM0, x1
#endif
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VI0, VI1, VT0
addi.d I, I, -1
XVFMAXA VM1, x1, VM0
XVCMPEQ VT0, VM0, VM1
add.d X, X, INCX
xvbitsel.v VM0, VM1, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
addi.d i1, i1, 1
add.d X, X, INCX
movgr2fr.d $f21, i1
blt $r0, I, .L22
MTG i0, $f20
.align 3


.L999:

