@@ -955,12 +955,18 @@ endif
 ifeq ($(ARCH), loongarch64)
 LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d)
+LA64_ARCH=$(shell $(CC) -march=loongarch64 -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo loongarch64)
 ifneq ($(LA64_ABI), lp64d)
 LA64_ABI=lp64
 endif
+ifneq ($(LA64_ARCH), loongarch64)
+CCOMMON_OPT += -mabi=$(LA64_ABI)
+FCOMMON_OPT += -mabi=$(LA64_ABI)
+else
 CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
 FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
 endif
+endif
 endif
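The new LA64_ARCH probe mirrors the existing LA64_ABI probe one line above it: it test-compiles cpuid_loongarch64.c with -march=loongarch64 and sets the variable only when the compiler accepts that flag. Toolchains that predate -march=loongarch64 support therefore keep building, receiving only -mabi=$(LA64_ABI) instead of failing on an unknown option.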
@@ -197,10 +197,22 @@ fi
 no_lsx=0
 no_lasx=0
 if [ "$architecture" = "loongarch64" ]; then
+  lasx_flags='-march=loongarch64'
+  lsx_flags='-march=loongarch64'
   tmpd="$(mktemp -d)"
+  tmparch="$tmpd/arch.c"
+  printf "void main(void){ }\n" >> "$tmparch"
+  args="-march=loongarch64 -o $tmparch.o $tmparch"
+  {
+    $compiler_name $flags $args >/dev/null 2>&1
+  } || {
+    lasx_flags=''
+    lsx_flags=''
+  }
   tmplsx="$tmpd/lsx.c"
   codelsx='"vadd.b $vr0, $vr0, $vr0"'
-  lsx_flags='-march=loongarch64'
   printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
   args="$lsx_flags -o $tmplsx.o $tmplsx"
   {
@@ -211,7 +223,6 @@ if [ "$architecture" = "loongarch64" ]; then
   tmplasx="$tmpd/lasx.c"
   codelasx='"xvadd.b $xr0, $xr0, $xr0"'
-  lasx_flags='-march=loongarch64'
   printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
   args="$lasx_flags -o $tmplasx.o $tmplasx"
   {
@@ -279,7 +279,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PROLOGUE
     PTR_LD INC_Y, $sp, 0
-    push_if_used 17 + 7, 31
+    push_if_used 7, 7
     PTR_ADDI K, $r0, 0x01
     PTR_SUB I, INC_X, K
     PTR_SUB J, INC_Y, K
@@ -318,6 +318,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
     CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
 .L_END:
-    pop_if_used 17 + 7, 31
+    pop_if_used 7, 7
     jirl $r0, $r1, 0x0
     EPILOGUE
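The push_if_used/pop_if_used changes here and in all the kernel hunks below follow a single convention change. The macro definition itself is outside this excerpt, but from the call sites the old arguments counted used registers from the bottom of each register file (e.g. 17 + 7 scratch-plus-saved GPRs, 31 FPRs), while the new arguments appear to count only the callee-saved GPRs and FPRs the kernel actually clobbers (7, 7 here; 8, 6; 9, 8; and so on). Kernels that never touch callee-saved state, such as the copy kernel below that goes from 26, 32 to 0, 0, now skip the prologue save/restore entirely.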
@@ -336,7 +336,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PROLOGUE
     PTR_LD INC_Y, $sp, 0
-    push_if_used 17 + 7, 31
+    push_if_used 7, 7
     PTR_ADDI K, $r0, 0x01
     PTR_SUB I, INC_X, K
     PTR_SUB J, INC_Y, K
@@ -378,6 +378,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
     CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1
 .L_END:
-    pop_if_used 17 + 7, 31
+    pop_if_used 7, 7
     jirl $r0, $r1, 0x0
     EPILOGUE
@@ -255,7 +255,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PROLOGUE
     PTR_LD INC_Y, $sp, 0
-    push_if_used 17 + 8, 30
+    push_if_used 8, 6
     PTR_ADDI K, $r0, 0x01
     PTR_SUB I, INC_X, K
     maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@@ -285,6 +285,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1: /* if (incx != 1) */
     CGEMV_T_LSX GAP_1, X4_GAP
 .L_END:
-    pop_if_used 17 + 8, 30
+    pop_if_used 8, 6
     jirl $r0, $r1, 0x0
     EPILOGUE
@@ -304,7 +304,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PROLOGUE
     PTR_LD INC_Y, $sp, 0
-    push_if_used 17 + 8, 30
+    push_if_used 8, 6
     PTR_ADDI K, $r0, 0x01
     PTR_SUB I, INC_X, K
     maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@@ -337,6 +337,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1: /* if (incx != 1) */
     CGEMV_T_LASX GAP_1, X8_GAP
 .L_END:
-    pop_if_used 17 + 8, 30
+    pop_if_used 8, 6
     jirl $r0, $r1, 0x0
     EPILOGUE
@@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define D7 $vr15
     PROLOGUE
-    push_if_used 26, 32
+    push_if_used 0, 0
     move TD, DST
     move TS, SRC
     slli.d TL, LDA, 0x03
@@ -278,6 +278,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d M, M, -1
     blt ZERO, M, .L_M1
 .L_N0:
-    pop_if_used 26, 32
+    pop_if_used 0, 0
     jirl $r0, $r1, 0x00
     EPILOGUE
@@ -66,7 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define U7 $vr7
     PROLOGUE
-    push_if_used 18, 8
+    push_if_used 1, 0
     move S0, SRC
     move P0, DST
@@ -274,7 +274,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     fst.d F0, P3, 0x00
 .L_M0:
-    pop_if_used 18, 8
+    pop_if_used 1, 0
     jirl $r0, $r1, 0x00
     EPILOGUE
@@ -76,7 +76,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define U7 $vr7
     PROLOGUE
-    push_if_used 24, 8
+    push_if_used 7, 0
     move S0, SRC
     move P0, DST
@@ -592,6 +592,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d S1, S1, 0x08
     addi.d P4, P4, 0x08
 .L_M0:
-    pop_if_used 24, 8
+    pop_if_used 7, 0
     jirl $r0, $r1, 0x00
     EPILOGUE
@@ -509,7 +509,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PROLOGUE
     PTR_LD INC_Y, $sp, 0
-    push_if_used 17 + 7, 24 + 4
+    push_if_used 7, 4
     PTR_ADDI K, $r0, 0x01
     PTR_SUB I, INC_X, K
     PTR_SUB J, INC_Y, K
@@ -549,6 +549,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
     DGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
 .L_END:
-    pop_if_used 17 + 7, 24 + 4
+    pop_if_used 7, 4
     jirl $r0, $r1, 0x0
     EPILOGUE
@@ -445,7 +445,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PROLOGUE
     PTR_LD INC_Y, $sp, 0
-    push_if_used 17 + 8, 24 + 3
+    push_if_used 8, 3
     PTR_ADDI K, $r0, 0x01
     PTR_SUB I, INC_X, K
     maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
@@ -476,6 +476,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L_GAP_1: /* if (incx != 1) */
     DGEMV_T_LASX GAP_1, X8_GAP, X4_GAP
 .L_END:
-    pop_if_used 17 + 8, 24 + 3
+    pop_if_used 8, 3
     jirl $r0, $r1, 0x0
     EPILOGUE
@@ -1029,7 +1029,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
     PROLOGUE
-    push_if_used 26, 32
+    push_if_used 9, 8
     PTR_SLLI LDC, LDC, 3
     /* if (!(N >> 2)) goto L_N3 */
     PTR_SRAI J, N, 2 /* J = bn >> 2 */
@@ -1361,6 +1361,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     blt ZERO, I, .L_N1_I1
 .L_N1_M0:
 .L_N0:
-    pop_if_used 26, 32
+    pop_if_used 9, 8
     jirl $r0, $r1, 0x0
     EPILOGUE
@@ -128,31 +128,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "dtrsm_kernel_macro.S"
-.macro ldrepl_macro start, end, stride
+.macro ldrepl_macro stride:req, index:req, more:vararg
 // Load Ux (x = 0...15)
-.if \start <= \end
-    GLDREPL xv, d, $xr\start, A0, \stride * 8
-    ldrepl_macro %start + 1, \end, %stride + 1
+    GLDREPL xv, d, $xr\index, A0, \index * 8 - \stride * 8
+.ifnb \more
+    ldrepl_macro \stride, \more
 .endif
 .endm
-.macro nmsub_macro start0, end0, start1, reg
+.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg
 // Gx -= reg * Ux
-.if \start0 <= \end0
     xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
-    nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
+.ifnb \more
+    nmsub_macro \reg, \more
 .endif
 .endm
-.macro B_st_macro start, end, stride, N
+.macro B_st_macro N:req, stride:req, start:req, more:vararg
 // Store Gx(x = 16...31)
-.if \start <= \end
 .if \N == 4
-    xvst $xr\start, B0, \stride * 0x20
+    xvst $xr\start, B0, \start * 0x20 - \stride * 0x20
 .elseif \N == 2
-    vst $vr\start, B0, \stride * 0x10
+    vst $vr\start, B0, \start * 0x10 - \stride * 0x10
 .elseif \N == 1
-    fst.d $f\start, B0, \stride * 0x08
+    fst.d $f\start, B0, \start * 0x08 - \stride * 0x08
 .endif
-    B_st_macro %start + 1, \end, %stride + 1, \N
+.ifnb \more
+    B_st_macro \N, \stride, \more
 .endif
 .endm
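The rewrite replaces bounded recursion over a register range (start..end, with a stride incremented in lockstep) by explicit vararg register lists: \stride is now a fixed offset bias, and each recursion step peels one \index off \more until .ifnb finds it empty. As a sketch of the semantics (a hand expansion, not part of the patch), the "Load 13" call used below:

    // ldrepl_macro 13, 13, 14, 15 expands step by step to:
    GLDREPL xv, d, $xr13, A0, 13 * 8 - 13 * 8  // offset 0x00; recurses with \more = "14, 15"
    GLDREPL xv, d, $xr14, A0, 14 * 8 - 13 * 8  // offset 0x08; recurses with \more = "15"
    GLDREPL xv, d, $xr15, A0, 15 * 8 - 13 * 8  // offset 0x10; \more now blank, recursion stops

which matches what the old bounded form ldrepl_macro 13, 15, 0 emitted. nmsub_macro likewise now takes the multiplier register first and then explicit (target, source) register pairs, so nmsub_macro G14, 31, 15 emits xvfnmsub.d $xr31, G14, $xr15, $xr31.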
@@ -194,86 +194,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 255
 // Sequentially extract data from A in row order
 // Load 0
-    ldrepl_macro 0, 15, 0
+    ldrepl_macro 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
     GMUL xvf, d, G0, G0, U0
-    nmsub_macro 17, 31, 1, G0
+    nmsub_macro G0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, \
+                25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
     PTR_ADDI A0, A0, 17 * 8
 // Load 1
-    ldrepl_macro 1, 15, 0
+    ldrepl_macro 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
     GMUL xvf, d, G1, G1, U1
-    nmsub_macro 18, 31, 2, G1
+    nmsub_macro G1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, \
+                25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
     PTR_ADDI A0, A0, 17 * 8
 // Load 2
-    ldrepl_macro 2, 15, 0
+    ldrepl_macro 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
     GMUL xvf, d, G2, G2, U2
-    nmsub_macro 19, 31, 3, G2
+    nmsub_macro G2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, \
+                10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
     PTR_ADDI A0, A0, 17 * 8
 // Load 3
-    ldrepl_macro 3, 15, 0
+    ldrepl_macro 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
     GMUL xvf, d, G3, G3, U3
-    nmsub_macro 20, 31, 4, G3
+    nmsub_macro G3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, \
+                27, 11, 28, 12, 29, 13, 30, 14, 31, 15
     PTR_ADDI A0, A0, 17 * 8
 // Load 4
-    ldrepl_macro 4, 15, 0
+    ldrepl_macro 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
     GMUL xvf, d, G4, G4, U4
-    nmsub_macro 21, 31, 5, G4
+    nmsub_macro G4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, \
+                28, 12, 29, 13, 30, 14, 31, 15
     PTR_ADDI A0, A0, 17 * 8
 // Load 5
-    ldrepl_macro 5, 15, 0
+    ldrepl_macro 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
     GMUL xvf, d, G5, G5, U5
-    nmsub_macro 22, 31, 6, G5
+    nmsub_macro G5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, \
+                29, 13, 30, 14, 31, 15
     PTR_ADDI A0, A0, 17 * 8
 // Load 6
-    ldrepl_macro 6, 15, 0
+    ldrepl_macro 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
     GMUL xvf, d, G6, G6, U6
-    nmsub_macro 23, 31, 7, G6
+    nmsub_macro G6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, \
+                30, 14, 31, 15
     PTR_ADDI A0, A0, 17 * 8
 // Load 7
-    ldrepl_macro 7, 15, 0
+    ldrepl_macro 7, 7, 8, 9, 10, 11, 12, 13, 14, 15
     GMUL xvf, d, G7, G7, U7
-    nmsub_macro 24, 31, 8, G7
+    nmsub_macro G7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
     PTR_ADDI A0, A0, 17 * 8
 // Load 8
-    ldrepl_macro 8, 15, 0
+    ldrepl_macro 8, 8, 9, 10, 11, 12, 13, 14, 15
     GMUL xvf, d, G8, G8, U8
-    nmsub_macro 25, 31, 9, G8
+    nmsub_macro G8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
     PTR_ADDI A0, A0, 17 * 8
 // Load 9
-    ldrepl_macro 9, 15, 0
+    ldrepl_macro 9, 9, 10, 11, 12, 13, 14, 15
     GMUL xvf, d, G9, G9, U9
-    nmsub_macro 26, 31, 10, G9
+    nmsub_macro G9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
     PTR_ADDI A0, A0, 17 * 8
 // Load 10
-    ldrepl_macro 10, 15, 0
+    ldrepl_macro 10, 10, 11, 12, 13, 14, 15
     GMUL xvf, d, G10, G10, U10
-    nmsub_macro 27, 31, 11, G10
+    nmsub_macro G10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
     PTR_ADDI A0, A0, 17 * 8
 // Load 11
-    ldrepl_macro 11, 15, 0
+    ldrepl_macro 11, 11, 12, 13, 14, 15
     GMUL xvf, d, G11, G11, U11
-    nmsub_macro 28, 31, 12, G11
+    nmsub_macro G11, 28, 12, 29, 13, 30, 14, 31, 15
     PTR_ADDI A0, A0, 17 * 8
 // Load 12
-    ldrepl_macro 12, 15, 0
+    ldrepl_macro 12, 12, 13, 14, 15
     GMUL xvf, d, G12, G12, U12
-    nmsub_macro 29, 31, 13, G12
+    nmsub_macro G12, 29, 13, 30, 14, 31, 15
     PTR_ADDI A0, A0, 17 * 8
 // Load 13
-    ldrepl_macro 13, 15, 0
+    ldrepl_macro 13, 13, 14, 15
     GMUL xvf, d, G13, G13, U13
-    nmsub_macro 30, 31, 14, G13
+    nmsub_macro G13, 30, 14, 31, 15
     PTR_ADDI A0, A0, 17 * 8
 // Load 14
-    ldrepl_macro 14, 15, 0
+    ldrepl_macro 14, 14, 15
     GMUL xvf, d, G14, G14, U14
-    nmsub_macro 31, 31, 15, G14
+    nmsub_macro G14, 31, 15
     PTR_ADDI A0, A0, 17 * 8
 // Load 15
-    ldrepl_macro 15, 15, 0
+    ldrepl_macro 15, 15
     GMUL xvf, d, G15, G15, U15
 // Finally, We can store the result.
 // For B, stored sequentially, and C, first transpose and then store
-    B_st_macro 16, 31, 0, \N
+    B_st_macro \N, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
     GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
     GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1
     GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1
@@ -334,46 +341,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 63
 // Sequentially extract data from A in row order
 // Load 0
-    ldrepl_macro 0, 7, 0
+    ldrepl_macro 0, 0, 1, 2, 3, 4, 5, 6, 7
     GMUL xvf, d, G0, G0, U0
-    nmsub_macro 17, 23, 1, G0
+    nmsub_macro G0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
     PTR_ADDI A0, A0, 9 * 8
 // Load 1
-    ldrepl_macro 1, 7, 0
+    ldrepl_macro 1, 1, 2, 3, 4, 5, 6, 7
     GMUL xvf, d, G1, G1, U1
-    nmsub_macro 18, 23, 2, G1
+    nmsub_macro G1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
     PTR_ADDI A0, A0, 9 * 8
 // Load 2
-    ldrepl_macro 2, 7, 0
+    ldrepl_macro 2, 2, 3, 4, 5, 6, 7
     GMUL xvf, d, G2, G2, U2
-    nmsub_macro 19, 23, 3, G2
+    nmsub_macro G2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
     PTR_ADDI A0, A0, 9 * 8
 // Load 3
-    ldrepl_macro 3, 7, 0
+    ldrepl_macro 3, 3, 4, 5, 6, 7
     GMUL xvf, d, G3, G3, U3
-    nmsub_macro 20, 23, 4, G3
+    nmsub_macro G3, 20, 4, 21, 5, 22, 6, 23, 7
     PTR_ADDI A0, A0, 9 * 8
 // Load 4
-    ldrepl_macro 4, 7, 0
+    ldrepl_macro 4, 4, 5, 6, 7
     GMUL xvf, d, G4, G4, U4
-    nmsub_macro 21, 23, 5, G4
+    nmsub_macro G4, 21, 5, 22, 6, 23, 7
     PTR_ADDI A0, A0, 9 * 8
 // Load 5
-    ldrepl_macro 5, 7, 0
+    ldrepl_macro 5, 5, 6, 7
     GMUL xvf, d, G5, G5, U5
-    nmsub_macro 22, 23, 6, G5
+    nmsub_macro G5, 22, 6, 23, 7
     PTR_ADDI A0, A0, 9 * 8
 // Load 6
-    ldrepl_macro 6, 7, 0
+    ldrepl_macro 6, 6, 7
     GMUL xvf, d, G6, G6, U6
-    nmsub_macro 23, 23, 7, G6
+    nmsub_macro G6, 23, 7
     PTR_ADDI A0, A0, 9 * 8
 // Load 7
-    ldrepl_macro 7, 7, 0
+    ldrepl_macro 7, 7
     GMUL xvf, d, G7, G7, U7
 // Finally, We can store the result.
 // For B, stored sequentially, and C, first transpose and then store
-    B_st_macro 16, 23, 0, \N
+    B_st_macro \N, 16, 16, 17, 18, 19, 20, 21, 22, 23
     GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
     GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1
 .if \N == 4
@@ -437,26 +444,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 15
 // Sequentially extract data from A in row order
 // Load 0
-    ldrepl_macro 0, 3, 0
+    ldrepl_macro 0, 0, 1, 2, 3
     GMUL xvf, d, G0, G0, U0
-    nmsub_macro 17, 19, 1, G0
+    nmsub_macro G0, 17, 1, 18, 2, 19, 3
     PTR_ADDI A0, A0, 5 * 8
 // Load 1
-    ldrepl_macro 1, 3, 0
+    ldrepl_macro 1, 1, 2, 3
     GMUL xvf, d, G1, G1, U1
-    nmsub_macro 18, 19, 2, G1
+    nmsub_macro G1, 18, 2, 19, 3
     PTR_ADDI A0, A0, 5 * 8
 // Load 2
-    ldrepl_macro 2, 3, 0
+    ldrepl_macro 2, 2, 3
     GMUL xvf, d, G2, G2, U2
-    nmsub_macro 19, 19, 3, G2
+    nmsub_macro G2, 19, 3
     PTR_ADDI A0, A0, 5 * 8
 // Load 3
-    ldrepl_macro 3, 3, 0
+    ldrepl_macro 3, 3
     GMUL xvf, d, G3, G3, U3
 // Finally, We can store the result.
 // For B, stored sequentially, and C, first transpose and then store
-    B_st_macro 16, 19, 0, \N
+    B_st_macro \N, 16, 16, 17, 18, 19
     GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
 .if \N == 4
     GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00
@@ -501,16 +508,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 3
 // Sequentially extract data from A in row order
 // Load 0
-    ldrepl_macro 0, 1, 0
+    ldrepl_macro 0, 0, 1
     GMUL xvf, d, G0, G0, U0
-    nmsub_macro 17, 17, 1, G0
+    nmsub_macro G0, 17, 1
     PTR_ADDI A0, A0, 3 * 8
 // Load 1
-    ldrepl_macro 1, 1, 0
+    ldrepl_macro 1, 1
     GMUL xvf, d, G1, G1, U1
 // Finally, We can store the result.
 // For B, stored sequentially, and C, first transpose and then store
-    B_st_macro 16, 17, 0, \N
+    B_st_macro \N, 16, 16, 17
     GSBUTTERFLY xv, d, U0, U1, G1, G0
 .if \N == 4
     vst $vr0, C0, 0x00
@@ -717,7 +724,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
     PROLOGUE
-    push_if_used 26, 32
+    push_if_used 9, 8
     PTR_SLLI LDC, LDC, 3
     /* if (!(N >> 2)) goto L_N3 */
     PTR_SRAI J, N, 2 /* J = bn >> 2 */
@@ -954,6 +961,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PTR_ADD AA, AA, T0 // aa += 1 * k
 .L_N1_M0:
 .L_N0:
-    pop_if_used 26, 32
+    pop_if_used 9, 8
     jirl $r0, $r1, 0x0
     EPILOGUE
@@ -128,33 +128,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "dtrsm_kernel_macro.S"
-.macro ldrepl_macro start, end, stride
+.macro ldrepl_macro stride:req, index:req, more:vararg
 // Load Ux (x = 0...15)
-.if \start <= \end
-    GLDREPL xv, d, $xr\start, B0, \stride * 8
-    ldrepl_macro %start + 1, \end, %stride + 1
+    GLDREPL xv, d, $xr\index, B0, \index * 8 - \stride * 8
+.ifnb \more
+    ldrepl_macro \stride, \more
 .endif
 .endm
-.macro nmsub_macro start0, end0, start1, reg
-// Ux -= reg * Dx
-.if \start0 <= \end0
+.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg
+// Gx -= reg * Ux
     xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
-    nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
+.ifnb \more
+    nmsub_macro \reg, \more
 .endif
 .endm
-.macro A_st_macro start, end, stride, N
-// Store Ux(x = 0...15)
-.if \start <= \end
+.macro A_st_macro N:req, stride:req, start:req, more:vararg
+// Store Gx(x = 16...31)
 .if \N == 4
-    xvst $xr\start, A0, \stride * 0x20
+    xvst $xr\start, A0, \start * 0x20 - \stride * 0x20
 .elseif \N == 2
-    vst $vr\start, A0, \stride * 0x10
+    vst $vr\start, A0, \start * 0x10 - \stride * 0x10
 .elseif \N == 1
-    fst.d $f\start, A0, \stride * 0x08
+    fst.d $f\start, A0, \start * 0x08 - \stride * 0x08
 .endif
-    A_st_macro %start + 1, \end, %stride + 1, \N
+.ifnb \more
+    A_st_macro \N, \stride, \more
 .endif
 .endm
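This file gets the same vararg rewrite, with B0 as the replication source and A_st_macro storing through A0. Note that the replacement comments "// Gx -= reg * Ux" and "// Store Gx(x = 16...31)" appear to be carried over from the kernel above; in this file the macros operate on Ux and Dx ("Ux -= reg * Dx", stores of U0...U15), so the old wording was the accurate one and may be worth restoring in review.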
@@ -167,22 +165,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 10 11
 // 15
 // Sequentially extract data from B in row order
-    ldrepl_macro 16, 19, 0
+    ldrepl_macro 16, 16, 17, 18, 19
     GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
-    ldrepl_macro 20, 22, 5
-    nmsub_macro 4, 7, 0, D1
-    ldrepl_macro 23, 24, 10
+    ldrepl_macro 15, 20, 21, 22
+    nmsub_macro D1, 4, 0, 5, 1, 6, 2, 7, 3
+    ldrepl_macro 13, 23, 24
     GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7
-    ldrepl_macro 25, 25, 15
-    nmsub_macro 8, 11, 0, D2
-    nmsub_macro 8, 11, 4, D5
+    ldrepl_macro 10, 25
+    nmsub_macro D2, 8, 0, 9, 1, 10, 2, 11, 3
+    nmsub_macro D5, 8, 4, 9, 5, 10, 6, 11, 7
     GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11
-    nmsub_macro 12, 15, 0, D3
-    nmsub_macro 12, 15, 4, D6
-    nmsub_macro 12, 15, 8, D8
+    nmsub_macro D3, 12, 0, 13, 1, 14, 2, 15, 3
+    nmsub_macro D6, 12, 4, 13, 5, 14, 6, 15, 7
+    nmsub_macro D8, 12, 8, 13, 9, 14, 10, 15, 11
     GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15
 // Store A
-    A_st_macro 0, 15, 0, 4
+    A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 // Store C
     GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
              U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \
@@ -197,13 +196,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0 1
 // 3
 // Sequentially extract data from B in row order
-    ldrepl_macro 16, 17, 0
+    ldrepl_macro 16, 16, 17
     GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
-    ldrepl_macro 18, 18, 3
-    nmsub_macro 4, 7, 0, D1
+    ldrepl_macro 15, 18
+    nmsub_macro D1, 4, 0, 5, 1, 6, 2, 7, 3
     GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
 // Store A
-    A_st_macro 0, 7, 0, 4
+    A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
 // Store C
     GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
              U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
@@ -218,22 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 10 11
 // 15
 // Sequentially extract data from B in row order
-    ldrepl_macro 16, 19, 0
+    ldrepl_macro 16, 16, 17, 18, 19
     GMUL xvf, d, U0, D0, U0, U1, D0, U1
-    ldrepl_macro 20, 22, 5
-    nmsub_macro 2, 3, 0, D1
-    ldrepl_macro 23, 24, 10
+    ldrepl_macro 15, 20, 21, 22
+    nmsub_macro D1, 2, 0, 3, 1
+    ldrepl_macro 13, 23, 24
     GMUL xvf, d, U2, D4, U2, U3, D4, U3
-    ldrepl_macro 25, 25, 15
-    nmsub_macro 4, 5, 0, D2
-    nmsub_macro 4, 5, 2, D5
+    ldrepl_macro 10, 25
+    nmsub_macro D2, 4, 0, 5, 1
+    nmsub_macro D5, 4, 2, 5, 3
     GMUL xvf, d, U4, D7, U4, U5, D7, U5
-    nmsub_macro 6, 7, 0, D3
-    nmsub_macro 6, 7, 2, D6
-    nmsub_macro 6, 7, 4, D8
+    nmsub_macro D3, 6, 0, 7, 1
+    nmsub_macro D6, 6, 2, 7, 3
+    nmsub_macro D8, 6, 4, 7, 5
     GMUL xvf, d, U6, D9, U6, U7, D9, U7
 // Store A
-    A_st_macro 0, 7, 0, 4
+    A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
 // Store C
     GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
              U2, C1, 0x00, U3, C1, 0x20, \
@@ -248,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0 1
 // 3
 // Sequentially extract data from B in row order
-    ldrepl_macro 16, 17, 0
+    ldrepl_macro 16, 16, 17
     GMUL xvf, d, U0, D0, U0, U1, D0, U1
-    ldrepl_macro 18, 18, 3
-    nmsub_macro 2, 3, 0, D1
+    ldrepl_macro 15, 18
+    nmsub_macro D1, 2, 0, 3, 1
     GMUL xvf, d, U2, D2, U2, U3, D2, U3
 // Store A
-    A_st_macro 0, 3, 0, 4
+    A_st_macro 4, 0, 0, 1, 2, 3
 // Store C
     GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
              U2, C1, 0x00, U3, C1, 0x20
@@ -269,22 +268,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 10 11
 // 15
 // Sequentially extract data from B in row order
-    ldrepl_macro 16, 19, 0
+    ldrepl_macro 16, 16, 17, 18, 19
     GMUL xvf, d, U0, D0, U0
-    ldrepl_macro 20, 22, 5
-    nmsub_macro 1, 1, 0, D1
-    ldrepl_macro 23, 24, 10
+    ldrepl_macro 15, 20, 21, 22
+    nmsub_macro D1, 1, 0
+    ldrepl_macro 13, 23, 24
     GMUL xvf, d, U1, D4, U1
-    ldrepl_macro 25, 25, 15
-    nmsub_macro 2, 2, 0, D2
-    nmsub_macro 2, 2, 1, D5
+    ldrepl_macro 10, 25
+    nmsub_macro D2, 2, 0
+    nmsub_macro D5, 2, 1
     GMUL xvf, d, U2, D7, U2
-    nmsub_macro 3, 3, 0, D3
-    nmsub_macro 3, 3, 1, D6
-    nmsub_macro 3, 3, 2, D8
+    nmsub_macro D3, 3, 0
+    nmsub_macro D6, 3, 1
+    nmsub_macro D8, 3, 2
     GMUL xvf, d, U3, D9, U3
 // Store A
-    A_st_macro 0, 3, 0, 4
+    A_st_macro 4, 0, 0, 1, 2, 3
 // Store C
     GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00
 .endm
@@ -296,13 +295,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0 1
 // 3
 // Sequentially extract data from B in row order
-    ldrepl_macro 16, 17, 0
+    ldrepl_macro 16, 16, 17
     GMUL xvf, d, U0, D0, U0
-    ldrepl_macro 18, 18, 3
-    nmsub_macro 1, 1, 0, D1
+    ldrepl_macro 15, 18
+    nmsub_macro D1, 1, 0
     GMUL xvf, d, U1, D2, U1
 // Store A
-    A_st_macro 0, 1, 0, 4
+    A_st_macro 4, 0, 0, 1
 // Store C
     GST xv, , U0, C0, 0x00, U1, C1, 0x00
 .endm
@@ -316,23 +315,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 10 11
 // 15
 // Sequentially extract data from B in row order
-    ldrepl_macro 16, 19, 0
+    ldrepl_macro 16, 16, 17, 18, 19
     GMUL xvf, d, U0, D0, U0
-    ldrepl_macro 20, 22, 5
-    nmsub_macro 1, 1, 0, D1
-    ldrepl_macro 23, 24, 10
+    ldrepl_macro 15, 20, 21, 22
+    nmsub_macro D1, 1, 0
+    ldrepl_macro 13, 23, 24
     GMUL xvf, d, U1, D4, U1
-    ldrepl_macro 25, 25, 15
-    nmsub_macro 2, 2, 0, D2
-    nmsub_macro 2, 2, 1, D5
+    ldrepl_macro 10, 25
+    nmsub_macro D2, 2, 0
+    nmsub_macro D5, 2, 1
     GMUL xvf, d, U2, D7, U2
-    nmsub_macro 3, 3, 0, D3
-    nmsub_macro 3, 3, 1, D6
-    nmsub_macro 3, 3, 2, D8
+    nmsub_macro D3, 3, 0
+    nmsub_macro D6, 3, 1
+    nmsub_macro D8, 3, 2
     GMUL xvf, d, U3, D9, U3
 // Store A
-    A_st_macro 0, 3, 0, 2
+    A_st_macro 2, 0, 0, 1, 2, 3
 // Store C
     GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00,
 .endm
@@ -344,13 +343,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0 1
 // 3
 // Sequentially extract data from B in row order
-    ldrepl_macro 16, 17, 0
+    ldrepl_macro 16, 16, 17
     GMUL xvf, d, U0, D0, U0
-    ldrepl_macro 18, 18, 3
-    nmsub_macro 1, 1, 0, D1
+    ldrepl_macro 15, 18
+    nmsub_macro D1, 1, 0
     GMUL xvf, d, U1, D2, U1
 // Store A
-    A_st_macro 0, 1, 0, 2
+    A_st_macro 2, 0, 0, 1
 // Store C
     GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00
 .endm
@@ -364,23 +363,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 10 11
 // 15
 // Sequentially extract data from B in row order
-    ldrepl_macro 16, 19, 0
+    ldrepl_macro 16, 16, 17, 18, 19
     GMUL xvf, d, U0, D0, U0
-    ldrepl_macro 20, 22, 5
-    nmsub_macro 1, 1, 0, D1
-    ldrepl_macro 23, 24, 10
+    ldrepl_macro 15, 20, 21, 22
+    nmsub_macro D1, 1, 0
+    ldrepl_macro 13, 23, 24
     GMUL xvf, d, U1, D4, U1
-    ldrepl_macro 25, 25, 15
-    nmsub_macro 2, 2, 0, D2
-    nmsub_macro 2, 2, 1, D5
+    ldrepl_macro 10, 25
+    nmsub_macro D2, 2, 0
+    nmsub_macro D5, 2, 1
     GMUL xvf, d, U2, D7, U2
-    nmsub_macro 3, 3, 0, D3
-    nmsub_macro 3, 3, 1, D6
-    nmsub_macro 3, 3, 2, D8
+    nmsub_macro D3, 3, 0
+    nmsub_macro D6, 3, 1
+    nmsub_macro D8, 3, 2
     GMUL xvf, d, U3, D9, U3
 // Store A
-    A_st_macro 0, 3, 0, 1
+    A_st_macro 1, 0, 0, 1, 2, 3
 // Store C
     GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00,
 .endm
@@ -392,13 +391,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0 1
 // 3
 // Sequentially extract data from B in row order
-    ldrepl_macro 16, 17, 0
+    ldrepl_macro 16, 16, 17
     GMUL xvf, d, U0, D0, U0
-    ldrepl_macro 18, 18, 3
-    nmsub_macro 1, 1, 0, D1
+    ldrepl_macro 15, 18
+    nmsub_macro D1, 1, 0
     GMUL xvf, d, U1, D2, U1
 // Store A
-    A_st_macro 0, 1, 0, 1
+    A_st_macro 1, 0, 0, 1
 // Store C
     GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
 .endm
@@ -582,10 +581,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvld U2, C0, 0x40
     xvld U3, C0, 0x60
 .L_dsolve_16x1:
-    ldrepl_macro 16, 16, 0
+    ldrepl_macro 16, 16
     GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
 // Store A
-    A_st_macro 0, 3, 0, 4
+    A_st_macro 4, 0, 0, 1, 2, 3
 // Strore C
     GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
 .endm
@@ -599,10 +598,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvld U0, C0, 0x00
     xvld U1, C0, 0x20
 .L_dsolve_8x1:
-    ldrepl_macro 16, 16, 0
+    ldrepl_macro 16, 16
     GMUL xvf, d, U0, D0, U0, U1, D0, U1
 // Store A
-    A_st_macro 0, 1, 0, 4
+    A_st_macro 4, 0, 0, 1
 // Strore C
     GST xv, , U0, C0, 0x00, U1, C0, 0x20
 .endm
@@ -615,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     /* Load C0 */
     xvld U0, C0, 0x00
 .L_dsolve_4x1:
-    ldrepl_macro 16, 16, 0
+    ldrepl_macro 16, 16
     GMUL xvf, d, U0, D0, U0
 // Store A
-    A_st_macro 0, 0, 0, 4
+    A_st_macro 4, 0, 0
 // Strore C
     GST xv, , U0, C0, 0x00
 .endm
@@ -631,10 +630,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     /* Load C0 */
     xvld U0, C0, 0x00
 .L_dsolve_2x1:
-    ldrepl_macro 16, 16, 0
+    ldrepl_macro 16, 16
     GMUL xvf, d, U0, D0, U0
 // Store A
-    A_st_macro 0, 0, 0, 2
+    A_st_macro 2, 0, 0
 // Strore C
     GST v, , $vr0, C0, 0x00
 .endm
@@ -647,16 +646,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     // Load C
     fld.d $f0, C0, 0x00
 .L_dsolve_1x1:
-    ldrepl_macro 16, 16, 0
+    ldrepl_macro 16, 16
     GMUL xvf, d, U0, D0, U0
 // Store A
-    A_st_macro 0, 0, 0, 1
+    A_st_macro 1, 0, 0
 // Strore C
     GST f, d, $f0, C0, 0x00
 .endm
     PROLOGUE
-    push_if_used 26, 32
+    push_if_used 9, 8
     PTR_SLLI LDC, LDC, 3
     PTR_SUB KK, ZERO, OFFSET
     /* if (!(N >> 2)) goto L_N3 */
@@ -877,6 +876,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PTR_ADD AA, AA, T0 // aa += 1 * k
 .L_N1_M0:
 .L_N0:
-    pop_if_used 26, 32
+    pop_if_used 9, 8
     jirl $r0, $r1, 0x0
     EPILOGUE
@@ -111,33 +111,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "dtrsm_kernel_macro.S"
-.macro ldrepl_macro start, end, stride
+.macro ldrepl_macro stride:req, index:req, more:vararg
 // Load Ux (x = 0...15)
-.if \start <= \end
-    GLDREPL xv, d, $xr\start, B0, \stride * 8
-    ldrepl_macro %start + 1, \end, %stride + 1
+    GLDREPL xv, d, $xr\index, B0, \index * 8 - \stride * 8
+.ifnb \more
+    ldrepl_macro \stride, \more
 .endif
 .endm
-.macro nmsub_macro start0, end0, start1, reg
-// Ux -= reg * Dx
-.if \start0 <= \end0
+.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg
+// Gx -= reg * Ux
     xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
-    nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
+.ifnb \more
+    nmsub_macro \reg, \more
 .endif
 .endm
-.macro A_st_macro start, end, stride, N
-// Store Ux(x = 0...15)
-.if \start <= \end
+.macro A_st_macro N:req, stride:req, start:req, more:vararg
+// Store Gx(x = 16...31)
 .if \N == 4
-    xvst $xr\start, A0, \stride * 0x20
+    xvst $xr\start, A0, \start * 0x20 - \stride * 0x20
 .elseif \N == 2
-    vst $vr\start, A0, \stride * 0x10
+    vst $vr\start, A0, \start * 0x10 - \stride * 0x10
 .elseif \N == 1
-    fst.d $f\start, A0, \stride * 0x08
+    fst.d $f\start, A0, \start * 0x08 - \stride * 0x08
 .endif
-    A_st_macro %start + 1, \end, %stride + 1, \N
+.ifnb \more
+    A_st_macro \N, \stride, \more
 .endif
 .endm
@@ -148,13 +146,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0
 //2 3
 // Sequentially extract data from B in row order
-    ldrepl_macro 16, 16, 0
-    ldrepl_macro 17, 18, 2
+    ldrepl_macro 16, 16
+    ldrepl_macro 15, 17, 18
     GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
-    nmsub_macro 0, 3, 4, D1
+    nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7
     GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
 // Store A
-    A_st_macro 0, 7, 0, 4
+    A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
 // Store C
     GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
              U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
@@ -167,13 +165,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0
 //2 3
 // Sequentially extract data from B in row order
-    ldrepl_macro 16, 16, 0
-    ldrepl_macro 17, 18, 2
+    ldrepl_macro 16, 16
+    ldrepl_macro 15, 17, 18
     GMUL xvf, d, U2, D2, U2, U3, D2, U3
-    nmsub_macro 0, 1, 2, D1
+    nmsub_macro D1, 0, 2, 1, 3
     GMUL xvf, d, U0, D0, U0, U1, D0, U1
 // Store A
-    A_st_macro 0, 3, 0, 4
+    A_st_macro 4, 0, 0, 1, 2, 3
 // Store C
     GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
              U2, C1, 0x00, U3, C1, 0x20
@@ -186,13 +184,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0
 //2 3
 // Sequentially extract data from B in row order
-    ldrepl_macro 16, 16, 0
-    ldrepl_macro 17, 18, 2
+    ldrepl_macro 16, 16
+    ldrepl_macro 15, 17, 18
     GMUL xvf, d, U1, D2, U1
-    nmsub_macro 0, 0, 1, D1
+    nmsub_macro D1, 0, 1
     GMUL xvf, d, U0, D0, U0
 // Store A
-    A_st_macro 0, 1, 0, 4
+    A_st_macro 4, 0, 0, 1
 // Store C
     GST xv, , U0, C0, 0x00, U1, C1, 0x00
 .endm
@@ -204,13 +202,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0
 //2 3
 // Sequentially extract data from B in row order
-    ldrepl_macro 16, 16, 0
-    ldrepl_macro 17, 18, 2
+    ldrepl_macro 16, 16
+    ldrepl_macro 15, 17, 18
     GMUL xvf, d, U1, D2, U1
-    nmsub_macro 0, 0, 1, D1
+    nmsub_macro D1, 0, 1
     GMUL xvf, d, U0, D0, U0
 // Store A
-    A_st_macro 0, 1, 0, 2
+    A_st_macro 2, 0, 0, 1
 // Store C
     GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00
 .endm
@@ -222,13 +220,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //0
 //2 3
 // Sequentially extract data from B in row order
-    ldrepl_macro 16, 16, 0
-    ldrepl_macro 17, 18, 2
+    ldrepl_macro 16, 16
+    ldrepl_macro 15, 17, 18
     GMUL xvf, d, U1, D2, U1
-    nmsub_macro 0, 0, 1, D1
+    nmsub_macro D1, 0, 1
     GMUL xvf, d, U0, D0, U0
 // Store A
-    A_st_macro 0, 1, 0, 1
+    A_st_macro 1, 0, 0, 1
 // Store C
     GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
 .endm
| @@ -242,22 +240,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| //8 9 10 | //8 9 10 | ||||
| //12 13 14 15 | //12 13 14 15 | ||||
| // Sequentially extract data from B in row order | // Sequentially extract data from B in row order | ||||
| ldrepl_macro 22, 25, 12 | |||||
| ldrepl_macro 10, 22, 23, 24, 25 | |||||
| GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 | GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 | ||||
| ldrepl_macro 19, 21, 8 | |||||
| nmsub_macro 8, 11, 12, D8 | |||||
| ldrepl_macro 17, 18, 4 | |||||
| ldrepl_macro 11, 19, 20, 21 | |||||
| nmsub_macro D8, 8, 12, 9, 13, 10, 14, 11, 15 | |||||
| ldrepl_macro 13, 17, 18 | |||||
| GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11 | GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11 | ||||
| ldrepl_macro 16, 16, 0 | |||||
| nmsub_macro 4, 7, 12, D7 | |||||
| nmsub_macro 4, 7, 8, D4 | |||||
| ldrepl_macro 16, 16 | |||||
| nmsub_macro D7, 4, 12, 5, 13, 6, 14, 7, 15 | |||||
| nmsub_macro D4, 4, 8, 5, 9, 6, 10, 7, 11 | |||||
| GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 | GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 | ||||
| nmsub_macro 0, 3, 12, D6 | |||||
| nmsub_macro 0, 3, 8, D3 | |||||
| nmsub_macro 0, 3, 4, D1 | |||||
| nmsub_macro D6, 0, 12, 1, 13, 2, 14, 3, 15 | |||||
| nmsub_macro D3, 0, 8, 1, 9, 2, 10, 3, 11 | |||||
| nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7 | |||||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 | GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 | ||||
| // Store A | // Store A | ||||
| A_st_macro 0, 15, 0, 4 | |||||
| A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |||||
| // Store C | // Store C | ||||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ | GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ | ||||
| U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ | U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ | ||||
| @@ -274,22 +272,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| //8 9 10 | //8 9 10 | ||||
| //12 13 14 15 | //12 13 14 15 | ||||
| // Sequentially extract data from B in row order | // Sequentially extract data from B in row order | ||||
| ldrepl_macro 22, 25, 12 | |||||
| ldrepl_macro 10, 22, 23, 24, 25 | |||||
| GMUL xvf, d, U6, D9, U6, U7, D9, U7 | GMUL xvf, d, U6, D9, U6, U7, D9, U7 | ||||
| ldrepl_macro 19, 21, 8 | |||||
| nmsub_macro 4, 5, 6, D8 | |||||
| ldrepl_macro 17, 18, 4 | |||||
| ldrepl_macro 11, 19, 20, 21 | |||||
| nmsub_macro D8, 4, 6, 5, 7 | |||||
| ldrepl_macro 13, 17, 18 | |||||
| GMUL xvf, d, U4, D5, U4, U5, D5, U5 | GMUL xvf, d, U4, D5, U4, U5, D5, U5 | ||||
| ldrepl_macro 16, 16, 0 | |||||
| nmsub_macro 2, 3, 6, D7 | |||||
| nmsub_macro 2, 3, 4, D4 | |||||
| ldrepl_macro 16, 16 | |||||
| nmsub_macro D7, 2, 6, 3, 7 | |||||
| nmsub_macro D4, 2, 4, 3, 5 | |||||
| GMUL xvf, d, U2, D2, U2, U3, D2, U3 | GMUL xvf, d, U2, D2, U2, U3, D2, U3 | ||||
| nmsub_macro 0, 1, 6, D6 | |||||
| nmsub_macro 0, 1, 4, D3 | |||||
| nmsub_macro 0, 1, 2, D1 | |||||
| nmsub_macro D6, 0, 6, 1, 7 | |||||
| nmsub_macro D3, 0, 4, 1, 5 | |||||
| nmsub_macro D1, 0, 2, 1, 3 | |||||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1 | GMUL xvf, d, U0, D0, U0, U1, D0, U1 | ||||
| // Store A | // Store A | ||||
| A_st_macro 0, 7, 0, 4 | |||||
| A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7 | |||||
| // Store C | // Store C | ||||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ | GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ | ||||
| U2, C1, 0x00, U3, C1, 0x20, \ | U2, C1, 0x00, U3, C1, 0x20, \ | ||||
| @@ -306,22 +304,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| //8 9 10 | //8 9 10 | ||||
| //12 13 14 15 | //12 13 14 15 | ||||
| // Sequentially extract data from B in row order | // Sequentially extract data from B in row order | ||||
| ldrepl_macro 22, 25, 12 | |||||
| ldrepl_macro 10, 22, 23, 24, 25 | |||||
| GMUL xvf, d, U3, D9, U3 | GMUL xvf, d, U3, D9, U3 | ||||
| ldrepl_macro 19, 21, 8 | |||||
| nmsub_macro 2, 2, 3, D8 | |||||
| ldrepl_macro 17, 18, 4 | |||||
| ldrepl_macro 11, 19, 20, 21 | |||||
| nmsub_macro D8, 2, 3 | |||||
| ldrepl_macro 13, 17, 18 | |||||
| GMUL xvf, d, U2, D5, U2 | GMUL xvf, d, U2, D5, U2 | ||||
| ldrepl_macro 16, 16, 0 | |||||
| nmsub_macro 1, 1, 3, D7 | |||||
| nmsub_macro 1, 1, 2, D4 | |||||
| ldrepl_macro 16, 16 | |||||
| nmsub_macro D7, 1, 3 | |||||
| nmsub_macro D4, 1, 2 | |||||
| GMUL xvf, d, U1, D2, U1 | GMUL xvf, d, U1, D2, U1 | ||||
| nmsub_macro 0, 0, 3, D6 | |||||
| nmsub_macro 0, 0, 2, D3 | |||||
| nmsub_macro 0, 0, 1, D1 | |||||
| nmsub_macro D6, 0, 3 | |||||
| nmsub_macro D3, 0, 2 | |||||
| nmsub_macro D1, 0, 1 | |||||
| GMUL xvf, d, U0, D0, U0 | GMUL xvf, d, U0, D0, U0 | ||||
| // Store A | // Store A | ||||
| A_st_macro 0, 3, 0, 4 | |||||
| A_st_macro 4, 0, 0, 1, 2, 3 | |||||
| // Store C | // Store C | ||||
| GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 | GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 | ||||
| .endm | .endm | ||||
| @@ -335,22 +333,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| //8 9 10 | //8 9 10 | ||||
| //12 13 14 15 | //12 13 14 15 | ||||
| // Sequentially extract data from B in row order | // Sequentially extract data from B in row order | ||||
| ldrepl_macro 22, 25, 12 | |||||
| ldrepl_macro 10, 22, 23, 24, 25 | |||||
| GMUL xvf, d, U3, D9, U3 | GMUL xvf, d, U3, D9, U3 | ||||
| ldrepl_macro 19, 21, 8 | |||||
| nmsub_macro 2, 2, 3, D8 | |||||
| ldrepl_macro 17, 18, 4 | |||||
| ldrepl_macro 11, 19, 20, 21 | |||||
| nmsub_macro D8, 2, 3 | |||||
| ldrepl_macro 13, 17, 18 | |||||
| GMUL xvf, d, U2, D5, U2 | GMUL xvf, d, U2, D5, U2 | ||||
| ldrepl_macro 16, 16, 0 | |||||
| nmsub_macro 1, 1, 3, D7 | |||||
| nmsub_macro 1, 1, 2, D4 | |||||
| ldrepl_macro 16, 16 | |||||
| nmsub_macro D7, 1, 3 | |||||
| nmsub_macro D4, 1, 2 | |||||
| GMUL xvf, d, U1, D2, U1 | GMUL xvf, d, U1, D2, U1 | ||||
| nmsub_macro 0, 0, 3, D6 | |||||
| nmsub_macro 0, 0, 2, D3 | |||||
| nmsub_macro 0, 0, 1, D1 | |||||
| nmsub_macro D6, 0, 3 | |||||
| nmsub_macro D3, 0, 2 | |||||
| nmsub_macro D1, 0, 1 | |||||
| GMUL xvf, d, U0, D0, U0 | GMUL xvf, d, U0, D0, U0 | ||||
| // Store A | // Store A | ||||
| A_st_macro 0, 3, 0, 2 | |||||
| A_st_macro 2, 0, 0, 1, 2, 3 | |||||
| // Store C | // Store C | ||||
| GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00 | GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00 | ||||
| .endm | .endm | ||||
| @@ -364,22 +362,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| //8 9 10 | //8 9 10 | ||||
| //12 13 14 15 | //12 13 14 15 | ||||
| // Sequentially extract data from B in row order | // Sequentially extract data from B in row order | ||||
| ldrepl_macro 22, 25, 12 | |||||
| ldrepl_macro 10, 22, 23, 24, 25 | |||||
| GMUL xvf, d, U3, D9, U3 | GMUL xvf, d, U3, D9, U3 | ||||
| ldrepl_macro 19, 21, 8 | |||||
| nmsub_macro 2, 2, 3, D8 | |||||
| ldrepl_macro 17, 18, 4 | |||||
| ldrepl_macro 11, 19, 20, 21 | |||||
| nmsub_macro D8, 2, 3 | |||||
| ldrepl_macro 13, 17, 18 | |||||
| GMUL xvf, d, U2, D5, U2 | GMUL xvf, d, U2, D5, U2 | ||||
| ldrepl_macro 16, 16, 0 | |||||
| nmsub_macro 1, 1, 3, D7 | |||||
| nmsub_macro 1, 1, 2, D4 | |||||
| ldrepl_macro 16, 16 | |||||
| nmsub_macro D7, 1, 3 | |||||
| nmsub_macro D4, 1, 2 | |||||
| GMUL xvf, d, U1, D2, U1 | GMUL xvf, d, U1, D2, U1 | ||||
| nmsub_macro 0, 0, 3, D6 | |||||
| nmsub_macro 0, 0, 2, D3 | |||||
| nmsub_macro 0, 0, 1, D1 | |||||
| nmsub_macro D6, 0, 3 | |||||
| nmsub_macro D3, 0, 2 | |||||
| nmsub_macro D1, 0, 1 | |||||
| GMUL xvf, d, U0, D0, U0 | GMUL xvf, d, U0, D0, U0 | ||||
| // Store A | // Store A | ||||
| A_st_macro 0, 3, 0, 1 | |||||
| A_st_macro 1, 0, 0, 1, 2, 3 | |||||
| // Store C | // Store C | ||||
| GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, | GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, | ||||
| .endm | .endm | ||||
| @@ -399,10 +397,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L_dsolve_16x1: | .L_dsolve_16x1: | ||||
| PTR_ADDI A0, T1, -16 * 8 | PTR_ADDI A0, T1, -16 * 8 | ||||
| PTR_ADDI B0, T2, -1 * 8 | PTR_ADDI B0, T2, -1 * 8 | ||||
| ldrepl_macro 16, 16, 0 | |||||
| ldrepl_macro 16, 16 | |||||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 | GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 | ||||
| // Store A | // Store A | ||||
| A_st_macro 0, 3, 0, 4 | |||||
| A_st_macro 4, 0, 0, 1, 2, 3 | |||||
| // Store C | // Store C | ||||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 | GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 | ||||
| .endm | .endm | ||||
| @@ -420,10 +418,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L_dsolve_8x1: | .L_dsolve_8x1: | ||||
| PTR_ADDI A0, T1, -8 * 8 | PTR_ADDI A0, T1, -8 * 8 | ||||
| PTR_ADDI B0, T2, -1 * 8 | PTR_ADDI B0, T2, -1 * 8 | ||||
| ldrepl_macro 16, 16, 0 | |||||
| ldrepl_macro 16, 16 | |||||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1 | GMUL xvf, d, U0, D0, U0, U1, D0, U1 | ||||
| // Store A | // Store A | ||||
| A_st_macro 0, 1, 0, 4 | |||||
| A_st_macro 4, 0, 0, 1 | |||||
| // Store C | // Store C | ||||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20 | GST xv, , U0, C0, 0x00, U1, C0, 0x20 | ||||
| .endm | .endm | ||||
| @@ -440,10 +438,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L_dsolve_4x1: | .L_dsolve_4x1: | ||||
| PTR_ADDI A0, T1, -4 * 8 | PTR_ADDI A0, T1, -4 * 8 | ||||
| PTR_ADDI B0, T2, -1 * 8 | PTR_ADDI B0, T2, -1 * 8 | ||||
| ldrepl_macro 16, 16, 0 | |||||
| ldrepl_macro 16, 16 | |||||
| GMUL xvf, d, U0, D0, U0 | GMUL xvf, d, U0, D0, U0 | ||||
| // Store A | // Store A | ||||
| A_st_macro 0, 0, 0, 4 | |||||
| A_st_macro 4, 0, 0 | |||||
| // Store C | // Store C | ||||
| GST xv, , U0, C0, 0x00 | GST xv, , U0, C0, 0x00 | ||||
| .endm | .endm | ||||
| @@ -460,10 +458,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L_dsolve_2x1: | .L_dsolve_2x1: | ||||
| PTR_ADDI A0, T1, -2 * 8 | PTR_ADDI A0, T1, -2 * 8 | ||||
| PTR_ADDI B0, T2, -1 * 8 | PTR_ADDI B0, T2, -1 * 8 | ||||
| ldrepl_macro 16, 16, 0 | |||||
| ldrepl_macro 16, 16 | |||||
| GMUL xvf, d, U0, D0, U0 | GMUL xvf, d, U0, D0, U0 | ||||
| // Store A | // Store A | ||||
| A_st_macro 0, 0, 0, 2 | |||||
| A_st_macro 2, 0, 0 | |||||
| // Store C | // Store C | ||||
| GST v, , $vr0, C0, 0x00 | GST v, , $vr0, C0, 0x00 | ||||
| .endm | .endm | ||||
| @@ -480,10 +478,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L_dsolve_1x1: | .L_dsolve_1x1: | ||||
| PTR_ADDI A0, T1, -1 * 8 | PTR_ADDI A0, T1, -1 * 8 | ||||
| PTR_ADDI B0, T2, -1 * 8 | PTR_ADDI B0, T2, -1 * 8 | ||||
| ldrepl_macro 16, 16, 0 | |||||
| ldrepl_macro 16, 16 | |||||
| GMUL xvf, d, U0, D0, U0 | GMUL xvf, d, U0, D0, U0 | ||||
| // Store A | // Store A | ||||
| A_st_macro 0, 0, 0, 1 | |||||
| A_st_macro 1, 0, 0 | |||||
| // Store C | // Store C | ||||
| GST f, d, $f0, C0, 0x00 | GST f, d, $f0, C0, 0x00 | ||||
| .endm | .endm | ||||
| @@ -697,7 +695,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| PROLOGUE | PROLOGUE | ||||
| push_if_used 26, 32 | |||||
| push_if_used 9, 8 | |||||
| PTR_SLLI LDC, LDC, 3 | PTR_SLLI LDC, LDC, 3 | ||||
| PTR_SUB KK, N, OFFSET | PTR_SUB KK, N, OFFSET | ||||
| PTR_MUL T0, N, LDC | PTR_MUL T0, N, LDC | ||||
| @@ -948,6 +946,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PTR_ADDI KK, KK, -4 | PTR_ADDI KK, KK, -4 | ||||
| bnez J, .L_J1 | bnez J, .L_J1 | ||||
| .L_N0: | .L_N0: | ||||
| pop_if_used 26, 32 | |||||
| pop_if_used 9, 8 | |||||
| jirl $r0, $r1, 0x0 | jirl $r0, $r1, 0x0 | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -90,57 +90,175 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define PTR_FST fst.d | #define PTR_FST fst.d | ||||
| #endif | #endif | ||||
| // The max registers available to the user which | |||||
| // do not need to be preserved across calls. | |||||
| // Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html | |||||
| #define MAX_INT_CALLER_SAVED 17 | |||||
| #define MAX_FP_CALLER_SAVED 24 | |||||
| .altmacro // Enable alternate macro mode | .altmacro // Enable alternate macro mode | ||||
| /* | |||||
| * Push and pop the callee-saved (static) registers to/from the stack.
| * regs : number of static general-purpose registers to save, 0 <= regs <= 9 ($s0-$s8)
| * fregs: number of static floating-point registers to save, 0 <= fregs <= 8 ($fs0-$fs7)
| */ | |||||
| .macro push_if_used regs, fregs | .macro push_if_used regs, fregs | ||||
| .if \regs > MAX_INT_CALLER_SAVED | |||||
| PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) | |||||
| push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 | |||||
| .if \regs > 0 | |||||
| PTR_ADDI $sp, $sp, -(\regs << REG_LOG) | |||||
| push_regs 0, \regs - 1 | |||||
| .endif | .endif | ||||
| .if \fregs > MAX_FP_CALLER_SAVED | |||||
| PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) | |||||
| push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 | |||||
| .if \fregs > 0 | |||||
| PTR_ADDI $sp, $sp, -(\fregs << FREG_LOG) | |||||
| push_fregs 0, \fregs - 1 | |||||
| .endif | .endif | ||||
| .endm // End push_if_used | .endm // End push_if_used | ||||
| .macro pop_if_used regs, fregs | .macro pop_if_used regs, fregs | ||||
| .if \fregs > MAX_FP_CALLER_SAVED | |||||
| pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 | |||||
| PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG | |||||
| .if \fregs > 0 | |||||
| pop_fregs 0, \fregs - 1 | |||||
| PTR_ADDI $sp, $sp, \fregs << FREG_LOG | |||||
| .endif | .endif | ||||
| .if \regs > MAX_INT_CALLER_SAVED | |||||
| pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 | |||||
| PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG | |||||
| .if \regs > 0 | |||||
| pop_regs 0, \regs - 1 | |||||
| PTR_ADDI $sp, $sp, \regs << REG_LOG | |||||
| .endif | .endif | ||||
| .endm // End pop_if_used | .endm // End pop_if_used | ||||
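| // Usage sketch (illustrative, not part of this patch): a kernel that
| // clobbers $s0-$s6 and $fs0-$fs6 now passes the exact counts,
| //   push_if_used 7, 7   // reserve 7 << REG_LOG bytes, store $s0-$s6,
| //                       // then 7 << FREG_LOG bytes, store $fs0-$fs6
| //   ...
| //   pop_if_used  7, 7   // restore in reverse order and release the stack
| // whereas the old interface took totals (e.g. 17 + 7, 31) and subtracted
| // the caller-saved maximums internally.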
| .macro push_regs from, to | .macro push_regs from, to | ||||
| PTR_ST $s\()\from, $sp, \from << REG_LOG | |||||
| #ifdef __clang__ | |||||
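| // Unrolled stores for clang: its integrated assembler is assumed not to
| // support the recursive .altmacro "%from + 1" expansion kept in the
| // #else branch below, so each $s0-$s8 store is written out explicitly.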
| .if \to >= 0 | |||||
| PTR_ST $s0, $sp, 0 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 1 | |||||
| PTR_ST $s1, $sp, 1 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 2 | |||||
| PTR_ST $s2, $sp, 2 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 3 | |||||
| PTR_ST $s3, $sp, 3 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 4 | |||||
| PTR_ST $s4, $sp, 4 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 5 | |||||
| PTR_ST $s5, $sp, 5 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 6 | |||||
| PTR_ST $s6, $sp, 6 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 7 | |||||
| PTR_ST $s7, $sp, 7 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 8 | |||||
| PTR_ST $s8, $sp, 8 << REG_LOG | |||||
| .endif | |||||
| #else | |||||
| PTR_ST $s\()\from, $sp, \from << REG_LOG | |||||
| .if \to - \from | .if \to - \from | ||||
| push_regs %from + 1, \to | push_regs %from + 1, \to | ||||
| .endif | .endif | ||||
| #endif | |||||
| .endm // End push_regs | .endm // End push_regs | ||||
| .macro pop_regs from, to | .macro pop_regs from, to | ||||
| #ifdef __clang__ | |||||
| .if \to >= 0 | |||||
| PTR_LD $s0, $sp, 0 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 1 | |||||
| PTR_LD $s1, $sp, 1 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 2 | |||||
| PTR_LD $s2, $sp, 2 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 3 | |||||
| PTR_LD $s3, $sp, 3 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 4 | |||||
| PTR_LD $s4, $sp, 4 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 5 | |||||
| PTR_LD $s5, $sp, 5 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 6 | |||||
| PTR_LD $s6, $sp, 6 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 7 | |||||
| PTR_LD $s7, $sp, 7 << REG_LOG | |||||
| .endif | |||||
| .if \to >= 8 | |||||
| PTR_LD $s8, $sp, 8 << REG_LOG | |||||
| .endif | |||||
| #else | |||||
| PTR_LD $s\()\from, $sp, \from << REG_LOG | PTR_LD $s\()\from, $sp, \from << REG_LOG | ||||
| .if \to - \from | .if \to - \from | ||||
| pop_regs %from + 1, \to | pop_regs %from + 1, \to | ||||
| .endif | .endif | ||||
| #endif | |||||
| .endm // End pop_regs | .endm // End pop_regs | ||||
| .macro push_fregs from, to | .macro push_fregs from, to | ||||
| #ifdef __clang__ | |||||
| .if \to >= 0 | |||||
| PTR_FST $fs0, $sp, 0 << FREG_LOG | |||||
| .endif | |||||
| .if \to >= 1 | |||||
| PTR_FST $fs1, $sp, 1 << FREG_LOG | |||||
| .endif | |||||
| .if \to >= 2 | |||||
| PTR_FST $fs2, $sp, 2 << FREG_LOG | |||||
| .endif | |||||
| .if \to >= 3 | |||||
| PTR_FST $fs3, $sp, 3 << FREG_LOG | |||||
| .endif | |||||
| .if \to >= 4 | |||||
| PTR_FST $fs4, $sp, 4 << FREG_LOG | |||||
| .endif | |||||
| .if \to >= 5 | |||||
| PTR_FST $fs5, $sp, 5 << FREG_LOG | |||||
| .endif | |||||
| .if \to >= 6 | |||||
| PTR_FST $fs6, $sp, 6 << FREG_LOG | |||||
| .endif | |||||
| .if \to >= 7 | |||||
| PTR_FST $fs7, $sp, 7 << FREG_LOG | |||||
| .endif | |||||
| #else | |||||
| PTR_FST $fs\()\from, $sp, \from << FREG_LOG | PTR_FST $fs\()\from, $sp, \from << FREG_LOG | ||||
| .if \to - \from | .if \to - \from | ||||
| push_fregs %from + 1, \to | push_fregs %from + 1, \to | ||||
| .endif | .endif | ||||
| #endif | |||||
| .endm // End push_fregs | .endm // End push_fregs | ||||
| .macro pop_fregs from, to | .macro pop_fregs from, to | ||||
| #ifdef __clang__ | |||||
| .if \to >= 0 | |||||
| PTR_FLD $fs0, $sp, 0 << FREG_LOG | |||||
| .endif | |||||
| .if \to >= 1 | |||||
| PTR_FLD $fs1, $sp, 1 << FREG_LOG | |||||
| .endif | |||||
| .if \to >= 2 | |||||
| PTR_FLD $fs2, $sp, 2 << FREG_LOG | |||||
| .endif | |||||
| .if \to >= 3 | |||||
| PTR_FLD $fs3, $sp, 3 << FREG_LOG | |||||
| .endif | |||||
| .if \to >= 4 | |||||
| PTR_FLD $fs4, $sp, 4 << FREG_LOG | |||||
| .endif | |||||
| .if \to >= 5 | |||||
| PTR_FLD $fs5, $sp, 5 << FREG_LOG | |||||
| .endif | |||||
| .if \to >= 6 | |||||
| PTR_FLD $fs6, $sp, 6 << FREG_LOG | |||||
| .endif | |||||
| .if \to >= 7 | |||||
| PTR_FLD $fs7, $sp, 7 << FREG_LOG | |||||
| .endif | |||||
| #else | |||||
| PTR_FLD $fs\()\from, $sp, \from << FREG_LOG | PTR_FLD $fs\()\from, $sp, \from << FREG_LOG | ||||
| .if \to - \from | .if \to - \from | ||||
| pop_fregs %from + 1, \to | pop_fregs %from + 1, \to | ||||
| .endif | .endif | ||||
| #endif | |||||
| .endm // End pop_fregs | .endm // End pop_fregs | ||||
| // | // | ||||
| @@ -275,7 +393,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| // GXOR | // GXOR | ||||
| // | // | ||||
| .macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | .macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | ||||
| \pre_op\()xor.\suf_op \out, \in0, \in1 | |||||
| .ifnb \pre_op | |||||
| \pre_op\()xor.v \out, \in0, \in1 | |||||
| .else | |||||
| xor.\suf_op \out, \in0, \in1 | |||||
| .endif | |||||
| .ifnb \more | .ifnb \more | ||||
| GXOR \pre_op, \suf_op, \more | GXOR \pre_op, \suf_op, \more | ||||
| .endif | .endif | ||||
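| // LSX/LASX define only the bitwise form of vector XOR (vxor.v/xvxor.v),
| // so a non-blank \pre_op now forces the ".v" suffix regardless of the
| // element type; the fallback keeps the original "xor.\suf_op" spelling
| // for non-vector callers. Sketch of the intended expansion:
| //   GXOR xv, w, $xr0, $xr1, $xr2   // -> xvxor.v $xr0, $xr1, $xr2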
| @@ -307,6 +429,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| GPRELD \more | GPRELD \more | ||||
| .endif | .endif | ||||
| .endm | .endm | ||||
| // | |||||
| // GPACKEV | |||||
| // | |||||
| .macro GPACKEV pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||||
| \pre_op\()packev.\suf_op \out, \in0, \in1 | |||||
| .ifnb \more | |||||
| GPACKEV \pre_op, \suf_op, \more | |||||
| .endif | |||||
| .endm | |||||
| // | |||||
| // GPACKOD | |||||
| // | |||||
| .macro GPACKOD pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||||
| \pre_op\()packod.\suf_op \out, \in0, \in1 | |||||
| .ifnb \more | |||||
| GPACKOD \pre_op, \suf_op, \more | |||||
| .endif | |||||
| .endm | |||||
| // | |||||
| // GSHUF4I | |||||
| // | |||||
| .macro GSHUF4I pre_op:req, suf_op:req, out:req, in0:req, in1:req /* imm */, more:vararg | |||||
| \pre_op\()shuf4i.\suf_op \out, \in0, \in1 | |||||
| .ifnb \more | |||||
| GSHUF4I \pre_op, \suf_op, \more | |||||
| .endif | |||||
| .endm | |||||
| .macro TRANSF2G name, pre_op:req, suf_op:req, more:vararg | |||||
| .ifeqs "\pre_op\()\suf_op", "vfs" | |||||
| \name v, w, \more | |||||
| .endif | |||||
| .ifeqs "\pre_op\()\suf_op", "vfd" | |||||
| \name v, d, \more | |||||
| .endif | |||||
| .ifeqs "\pre_op\()\suf_op", "xvfs" | |||||
| \name xv, w, \more | |||||
| .endif | |||||
| .ifeqs "\pre_op\()\suf_op", "xvfd" | |||||
| \name xv, d, \more | |||||
| .endif | |||||
| .endm | |||||
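| // TRANSF2G maps the float-flavoured (pre_op, suf_op) pairs used by the
| // complex helpers onto the integer vector forms taken by GXOR/GPACKEV/
| // GPACKOD/GSHUF4I: (vf,s)->(v,w), (vf,d)->(v,d), (xvf,s)->(xv,w),
| // (xvf,d)->(xv,d). Sketch:
| //   TRANSF2G GPACKEV, xvf, d, $xr2, $xr0, $xr1   // -> xvpackev.d $xr2, $xr0, $xr1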
| // | // | ||||
| // Compound instructions | // Compound instructions | ||||
| @@ -314,61 +478,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| // GACC: Accumulate the values of vector registers | // GACC: Accumulate the values of vector registers | ||||
| // | // | ||||
| .macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg | .macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg | ||||
| .ifeqs "\pre_op", "xvf" | |||||
| .ifeqs "\pre_op\()\suf_op", "xvfd" | |||||
| xvpermi.q \out, \in, 0x01 | |||||
| \pre_op\()add.\suf_op \in, \out, \in | |||||
| xvpackod.d \out, \in, \in | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| .endif | |||||
| .ifeqs "\pre_op\()\suf_op", "xvfs" | |||||
| xvpermi.q \out, \in, 0x01 | xvpermi.q \out, \in, 0x01 | ||||
| \pre_op\()add.\suf_op \in, \out, \in | \pre_op\()add.\suf_op \in, \out, \in | ||||
| xvpackod.d \out, \in, \in | xvpackod.d \out, \in, \in | ||||
| \pre_op\()add.\suf_op \out, \out, \in | \pre_op\()add.\suf_op \out, \out, \in | ||||
| .ifeqs "\suf_op", "s" | |||||
| xvpackod.w \in, \out, \out | xvpackod.w \in, \out, \out | ||||
| \pre_op\()add.\suf_op \out, \out, \in | \pre_op\()add.\suf_op \out, \out, \in | ||||
| .endif | .endif | ||||
| .ifeqs "\pre_op\()\suf_op", "vfd" | |||||
| vpackod.d \out, \in, \in | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| .endif | .endif | ||||
| .ifeqs "\pre_op", "vf" | |||||
| .ifeqs "\pre_op\()\suf_op", "vfs" | |||||
| vpackod.d \out, \in, \in | vpackod.d \out, \in, \in | ||||
| \pre_op\()add.\suf_op \out, \out, \in | \pre_op\()add.\suf_op \out, \out, \in | ||||
| .ifeqs "\suf_op", "s" | |||||
| vpackod.w \in, \out, \out | vpackod.w \in, \out, \out | ||||
| \pre_op\()add.\suf_op \out, \out, \in | \pre_op\()add.\suf_op \out, \out, \in | ||||
| .endif | .endif | ||||
| .endif | |||||
| .ifeqs "\pre_op", "xv" | |||||
| .ifeqs "\pre_op\()\suf_op", "xvd" | |||||
| xvpermi.q \out, \in, 0x01 | |||||
| \pre_op\()add.\suf_op \in, \out, \in | |||||
| xvpackod.d \out, \in, \in | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| .endif | |||||
| .ifeqs "\pre_op\()\suf_op", "xvw" | |||||
| xvpermi.q \out, \in, 0x01 | |||||
| \pre_op\()add.\suf_op \in, \out, \in | |||||
| xvpackod.d \out, \in, \in | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| xvpackod.w \in, \out, \out | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| .endif | |||||
| .ifeqs "\pre_op\()\suf_op", "xvh" | |||||
| xvpermi.q \out, \in, 0x01 | |||||
| \pre_op\()add.\suf_op \in, \out, \in | |||||
| xvpackod.d \out, \in, \in | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| xvpackod.w \in, \out, \out | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| xvpackod.h \in, \out, \out | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| .endif | |||||
| .ifeqs "\pre_op\()\suf_op", "xvb" | |||||
| xvpermi.q \out, \in, 0x01 | xvpermi.q \out, \in, 0x01 | ||||
| \pre_op\()add.\suf_op \in, \out, \in | \pre_op\()add.\suf_op \in, \out, \in | ||||
| xvpackod.d \out, \in, \in | xvpackod.d \out, \in, \in | ||||
| \pre_op\()add.\suf_op \out, \out, \in | \pre_op\()add.\suf_op \out, \out, \in | ||||
| .ifnc "\suf_op", "d" | |||||
| xvpackod.w \in, \out, \out | xvpackod.w \in, \out, \out | ||||
| \pre_op\()add.\suf_op \out, \out, \in | \pre_op\()add.\suf_op \out, \out, \in | ||||
| .ifnc "\suf_op", "w" | |||||
| xvpackod.h \in, \out, \out | xvpackod.h \in, \out, \out | ||||
| \pre_op\()add.\suf_op \out, \out, \in | \pre_op\()add.\suf_op \out, \out, \in | ||||
| .ifnc "\suf_op", "h" | |||||
| xvpackod.b \in, \out, \out | xvpackod.b \in, \out, \out | ||||
| \pre_op\()add.\suf_op \out, \out, \in | \pre_op\()add.\suf_op \out, \out, \in | ||||
| .endif | .endif | ||||
| .ifeqs "\pre_op\()\suf_op", "vd" | |||||
| vpackod.d \out, \in, \in | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| .endif | .endif | ||||
| .ifeqs "\pre_op\()\suf_op", "vw" | |||||
| vpackod.d \out, \in, \in | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| vpackod.w \in, \out, \out | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| .endif | .endif | ||||
| .ifeqs "\pre_op\()\suf_op", "vh" | |||||
| vpackod.d \out, \in, \in | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| vpackod.w \in, \out, \out | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| vpackod.h \in, \out, \out | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| .endif | .endif | ||||
| .ifeqs "\pre_op", "v" | |||||
| .ifeqs "\pre_op\()\suf_op", "vb" | |||||
| vpackod.d \out, \in, \in | vpackod.d \out, \in, \in | ||||
| \pre_op\()add.\suf_op \out, \out, \in | \pre_op\()add.\suf_op \out, \out, \in | ||||
| .ifnc "\suf_op", "d" | |||||
| vpackod.w \in, \out, \out | vpackod.w \in, \out, \out | ||||
| \pre_op\()add.\suf_op \out, \out, \in | \pre_op\()add.\suf_op \out, \out, \in | ||||
| .ifnc "\suf_op", "w" | |||||
| vpackod.h \in, \out, \out | vpackod.h \in, \out, \out | ||||
| \pre_op\()add.\suf_op \out, \out, \in | \pre_op\()add.\suf_op \out, \out, \in | ||||
| .ifnc "\suf_op", "h" | |||||
| vpackod.b \in, \out, \out | vpackod.b \in, \out, \out | ||||
| \pre_op\()add.\suf_op \out, \out, \in | \pre_op\()add.\suf_op \out, \out, \in | ||||
| .endif | .endif | ||||
| .endif | |||||
| .endif | |||||
| .endif | |||||
| .ifnb \more | .ifnb \more | ||||
| GACC \pre_op, \suf_op, \more | GACC \pre_op, \suf_op, \more | ||||
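| // Each (pre_op, suf_op) pair now selects a fully spelled-out horizontal
| // reduction rather than nested .ifnc chains. Sketch for "GACC xv, d, $xr1, $xr0":
| //   xvpermi.q  $xr1, $xr0, 0x01   // fold the high 128-bit lane onto the low
| //   xvadd.d    $xr0, $xr1, $xr0
| //   xvpackod.d $xr1, $xr0, $xr0   // fold the remaining pair of doubles
| //   xvadd.d    $xr1, $xr1, $xr0   // element 0 of $xr1 holds the full sum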
| @@ -391,27 +590,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| // Note: When "pre_op = xvf && suf_op = s", \in will be modified. | // Note: When "pre_op = xvf && suf_op = s", \in will be modified. | ||||
| // | // | ||||
| .macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg | .macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg | ||||
| .ifeqs "\pre_op", "xvf" | |||||
| .ifeqs "\pre_op\()\suf_op", "xvfd" | |||||
| xvpermi.q \out, \in, 0x01 | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| .endif | |||||
| .ifeqs "\pre_op\()\suf_op", "xvfs" | |||||
| xvpermi.q \out, \in, 0x01 | xvpermi.q \out, \in, 0x01 | ||||
| .ifeqs "\suf_op", "s" | |||||
| \pre_op\()add.\suf_op \in, \out, \in | \pre_op\()add.\suf_op \in, \out, \in | ||||
| xvpackod.d \out, \in, \in | xvpackod.d \out, \in, \in | ||||
| \pre_op\()add.\suf_op \out, \out, \in | \pre_op\()add.\suf_op \out, \out, \in | ||||
| .else | |||||
| \pre_op\()add.\suf_op \out, \out, \in | |||||
| .endif | .endif | ||||
| .ifeqs "\pre_op\()\suf_op", "vfd" | |||||
| vor.v \out, \in, \in | |||||
| .endif | .endif | ||||
| .ifeqs "\pre_op", "vf" | |||||
| .ifeqs "\suf_op", "s" | |||||
| .ifeqs "\pre_op\()\suf_op", "vfs" | |||||
| vpackod.d \out, \in, \in | vpackod.d \out, \in, \in | ||||
| \pre_op\()add.\suf_op \out, \out, \in | \pre_op\()add.\suf_op \out, \out, \in | ||||
| .else | |||||
| vor.v \out, \in, \in | |||||
| .endif | |||||
| .endif | .endif | ||||
| .ifnb \more | .ifnb \more | ||||
| GCOMPLEXACC \pre_op, \suf_op, \more | GCOMPLEXACC \pre_op, \suf_op, \more | ||||
| .endif | .endif | ||||
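| // After flattening: "xvfd" folds the two 128-bit lanes once (one complex
| // double per lane); "xvfs" also combines the two complex floats within a
| // lane; and the LSX double case ("vfd") degenerates to a register move
| // (vor.v), since a 128-bit register holds exactly one complex double.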
| @@ -430,56 +629,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| // suf_op: s or d, differentiates between single- and double-precision complex numbers | // suf_op: s or d, differentiates between single- and double-precision complex numbers | ||||
| // | // | ||||
| .macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg | .macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg | ||||
| .ifeqs "\pre_op", "xvf" | |||||
| xvxor.v \tmp1, \tmp1, \tmp1 | |||||
| .ifeqs "\suf_op", "s" | |||||
| xvpackev.w \tmp0, \in0, \in0 | |||||
| .else | |||||
| xvpackev.d \tmp0, \in0, \in0 | |||||
| .endif | |||||
| .else | |||||
| vxor.v \tmp1, \tmp1, \tmp1 | |||||
| .ifeqs "\suf_op", "s" | |||||
| vpackev.w \tmp0, \in0, \in0 | |||||
| .else | |||||
| vpackev.d \tmp0, \in0, \in0 | |||||
| .endif | |||||
| .endif | |||||
| TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1 | |||||
| TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0 | |||||
| \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 | \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 | ||||
| .ifeqs "\pre_op", "xvf" | |||||
| .ifeqs "\suf_op", "s" | |||||
| .ifeqs "\xconj", "0" | .ifeqs "\xconj", "0" | ||||
| xvpackod.w \tmp1, \in0, \tmp1 | |||||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1 | |||||
| .else | .else | ||||
| xvpackod.w \tmp1, \tmp1, \in0 | |||||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0 | |||||
| .endif | .endif | ||||
| xvshuf4i.w \tmp2, \in1, 0xb1 | |||||
| .else | |||||
| .ifeqs "\xconj", "0" | |||||
| xvpackod.d \tmp1, \in0, \tmp1 | |||||
| .else | |||||
| xvpackod.d \tmp1, \tmp1, \in0 | |||||
| .endif | |||||
| xvshuf4i.d \tmp2, \in1, 0x0b | |||||
| .endif | |||||
| .else | |||||
| .ifeqs "\suf_op", "s" | .ifeqs "\suf_op", "s" | ||||
| .ifeqs "\xconj", "0" | |||||
| vpackod.w \tmp1, \in0, \tmp1 | |||||
| .else | |||||
| vpackod.w \tmp1, \tmp1, \in0 | |||||
| .endif | |||||
| vshuf4i.w \tmp2, \in1, 0xb1 | |||||
| .else | |||||
| .ifeqs "\xconj", "0" | |||||
| vpackod.d \tmp1, \in0, \tmp1 | |||||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 | |||||
| .else | .else | ||||
| vpackod.d \tmp1, \tmp1, \in0 | |||||
| .endif | |||||
| vshuf4i.d \tmp2, \in1, 0x0b | |||||
| .endif | |||||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b | |||||
| .endif | .endif | ||||
| \pre_op\()mul.\suf_op \out, \tmp0, \in1 | \pre_op\()mul.\suf_op \out, \tmp0, \in1 | ||||
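| // Net effect for out = in0 * in1 (xconj = 0), as a sketch:
| //   tmp0 = packev(in0, in0)                  // re(a) duplicated into both slots
| //   tmp1 = 0 - in0, then packod(in0, tmp1)   // lanes become (-im(a), +im(a))
| //   tmp2 = shuf4i(in1, imm)                  // in1 with re/im swapped
| //   out  = tmp0 * in1, then out += tmp1 * tmp2 via the madd that follows,
| // giving the usual packed complex product (re*re - im*im, re*im + im*re).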
| @@ -512,112 +676,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| // suf_op: s or d, differentiates between single- and double-precision complex numbers | // suf_op: s or d, differentiates between single- and double-precision complex numbers | ||||
| // | // | ||||
| .macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg | .macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg | ||||
| .ifeqs "\pre_op", "xvf" | |||||
| xvxor.v \tmp1, \tmp1, \tmp1 | |||||
| .ifeqs "\suf_op", "s" | |||||
| xvpackev.w \tmp0, \in0, \in0 | |||||
| .else | |||||
| xvpackev.d \tmp0, \in0, \in0 | |||||
| .endif | |||||
| .else | |||||
| vxor.v \tmp1, \tmp1, \tmp1 | |||||
| .ifeqs "\suf_op", "s" | |||||
| vpackev.w \tmp0, \in0, \in0 | |||||
| .else | |||||
| vpackev.d \tmp0, \in0, \in0 | |||||
| .endif | |||||
| .endif | |||||
| TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1 | |||||
| TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0 | |||||
| \pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2 | \pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2 | ||||
| .ifeqs "\conj", "1" | |||||
| .ifeqs "\conj\()\suf_op", "1s" | |||||
| \pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2 | \pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2 | ||||
| .ifeqs "\pre_op", "xvf" | |||||
| .ifeqs "\suf_op", "s" | |||||
| xvshuf4i.w \tmp0, \tmp0, 0xb1 | |||||
| xvpackev.w \out, \tmp0, \tmp2 | |||||
| .else | |||||
| xvshuf4i.d \tmp0, \tmp0, 0x0b | |||||
| xvpackev.d \out, \tmp0, \tmp2 | |||||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0xb1 | |||||
| TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2 | |||||
| .endif | .endif | ||||
| .else | |||||
| .ifeqs "\suf_op", "s" | |||||
| vshuf4i.w \tmp0, \tmp0, 0xb1 | |||||
| vpackev.w \out, \tmp0, \tmp2 | |||||
| .else | |||||
| vshuf4i.d \tmp0, \tmp0, 0x0b | |||||
| vpackev.d \out, \tmp0, \tmp2 | |||||
| .ifeqs "\conj\()\suf_op", "1d" | |||||
| \pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2 | |||||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0x0b | |||||
| TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2 | |||||
| .endif | .endif | ||||
| .endif /* pre_op = xvf */ | |||||
| .else | |||||
| .ifeqs "\conj", "0" | |||||
| \pre_op\()add.\suf_op \out, \tmp2, \tmp1 | \pre_op\()add.\suf_op \out, \tmp2, \tmp1 | ||||
| .endif /* conj = 1 */ | |||||
| .endif | |||||
| \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 | \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 | ||||
| .ifeqs "\pre_op", "xvf" | |||||
| .ifeqs "\suf_op", "s" | |||||
| .ifeqs "\conj", "0" | |||||
| .ifeqs "\xconj", "0" | |||||
| xvpackod.w \tmp1, \in0, \tmp1 | |||||
| .else | |||||
| xvpackod.w \tmp1, \tmp1, \in0 | |||||
| .ifeqs "\xconj\()\conj\()\suf_op", "00s" | |||||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1 | |||||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 | |||||
| .endif | .endif | ||||
| .else | |||||
| .ifeqs "\xconj", "0" | |||||
| xvpackod.w \tmp1, \in0, \in0 | |||||
| .else | |||||
| xvpackod.w \tmp1, \tmp1, \tmp1 | |||||
| .ifeqs "\xconj\()\conj\()\suf_op", "10s" | |||||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0 | |||||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 | |||||
| .endif | .endif | ||||
| .ifeqs "\xconj\()\conj\()\suf_op", "01s" | |||||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0 | |||||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 | |||||
| .endif | .endif | ||||
| xvshuf4i.w \tmp2, \in1, 0xb1 | |||||
| .else | |||||
| .ifeqs "\conj", "0" | |||||
| .ifeqs "\xconj", "0" | |||||
| xvpackod.d \tmp1, \in0, \tmp1 | |||||
| .else | |||||
| xvpackod.d \tmp1, \tmp1, \in0 | |||||
| .endif | |||||
| .else | |||||
| .ifeqs "\xconj", "0" | |||||
| xvpackod.d \tmp1, \in0, \in0 | |||||
| .else | |||||
| xvpackod.d \tmp1, \tmp1, \tmp1 | |||||
| .endif | |||||
| .endif | |||||
| xvshuf4i.d \tmp2, \in1, 0x0b | |||||
| .endif | |||||
| .else | |||||
| .ifeqs "\suf_op", "s" | |||||
| .ifeqs "\conj", "0" | |||||
| .ifeqs "\xconj", "0" | |||||
| vpackod.w \tmp1, \in0, \tmp1 | |||||
| .else | |||||
| vpackod.w \tmp1, \tmp1, \in0 | |||||
| .ifeqs "\xconj\()\conj\()\suf_op", "11s" | |||||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1 | |||||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 | |||||
| .endif | .endif | ||||
| .else | |||||
| .ifeqs "\xconj", "0" | |||||
| vpackod.w \tmp1, \in0, \in0 | |||||
| .else | |||||
| vpackod.w \tmp1, \tmp1, \tmp1 | |||||
| .endif | |||||
| .endif | |||||
| vshuf4i.w \tmp2, \in1, 0xb1 | |||||
| .else | |||||
| .ifeqs "\conj", "0" | |||||
| .ifeqs "\xconj", "0" | |||||
| vpackod.d \tmp1, \in0, \tmp1 | |||||
| .else | |||||
| vpackod.d \tmp1, \tmp1, \in0 | |||||
| .endif | |||||
| .else | |||||
| .ifeqs "\xconj", "0" | |||||
| vpackod.d \tmp1, \in0, \in0 | |||||
| .else | |||||
| vpackod.d \tmp1, \tmp1, \tmp1 | |||||
| .ifeqs "\xconj\()\conj\()\suf_op", "00d" | |||||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1 | |||||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b | |||||
| .endif | .endif | ||||
| .ifeqs "\xconj\()\conj\()\suf_op", "10d" | |||||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0 | |||||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b | |||||
| .endif | .endif | ||||
| vshuf4i.d \tmp2, \in1, 0x0b | |||||
| .ifeqs "\xconj\()\conj\()\suf_op", "01d" | |||||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0 | |||||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b | |||||
| .endif | .endif | ||||
| .ifeqs "\xconj\()\conj\()\suf_op", "11d" | |||||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1 | |||||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b | |||||
| .endif | .endif | ||||
| \pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out | \pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out | ||||
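| // The nested conjugation ladders are flattened by keying on the
| // concatenated selector "\xconj\()\conj\()\suf_op", so each of the eight
| // (xconj, conj) x (s, d) variants picks exactly one GPACKOD/GSHUF4I pair.
| // Sketch: xconj = 0, conj = 0, suf_op = s matches "00s" and, via TRANSF2G,
| // emits (x)vpackod.w \tmp1, \in0, \tmp1 and (x)vshuf4i.w \tmp2, \in1, 0xb1.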
| @@ -837,7 +837,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| PROLOGUE | PROLOGUE | ||||
| push_if_used 26, 32 | |||||
| push_if_used 9, 8 | |||||
| xvreplve0.w VALPHA, $xr0 | xvreplve0.w VALPHA, $xr0 | ||||
| #if defined (TRMMKERNEL) && !defined(LEFT) | #if defined (TRMMKERNEL) && !defined(LEFT) | ||||
| PTR_SUB OFF, ZERO, OFFSET | PTR_SUB OFF, ZERO, OFFSET | ||||
| @@ -2343,6 +2343,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif // #if defined(TRMMKERNEL) | #endif // #if defined(TRMMKERNEL) | ||||
| .L_N1_M0: | .L_N1_M0: | ||||
| .L_N0: | .L_N0: | ||||
| pop_if_used 26, 32 | |||||
| pop_if_used 9, 8 | |||||
| jirl $r0, $r1, 0x0 | jirl $r0, $r1, 0x0 | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| //.L_N0 | //.L_N0 | ||||
| PROLOGUE | PROLOGUE | ||||
| push_if_used 26, 32 | |||||
| push_if_used 9, 8 | |||||
| move TD, DST | move TD, DST | ||||
| move TS, SRC | move TS, SRC | ||||
| @@ -458,6 +458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PTR_ADDI M, M, -1 | PTR_ADDI M, M, -1 | ||||
| blt ZERO, M, .L_N1_M1 | blt ZERO, M, .L_N1_M1 | ||||
| .L_N0: | .L_N0: | ||||
| pop_if_used 26, 32 | |||||
| pop_if_used 9, 8 | |||||
| jirl $r0, $r1, 0x0 | jirl $r0, $r1, 0x0 | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| //.L_N0 | //.L_N0 | ||||
| PROLOGUE | PROLOGUE | ||||
| push_if_used 17, 20 | |||||
| push_if_used 0, 0 | |||||
| move TD, DST | move TD, DST | ||||
| move TS, SRC | move TS, SRC | ||||
| @@ -293,6 +293,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PTR_ADDI M, M, -1 | PTR_ADDI M, M, -1 | ||||
| blt ZERO, M, .L_N1_M1 | blt ZERO, M, .L_N1_M1 | ||||
| .L_N0: | .L_N0: | ||||
| pop_if_used 17, 20 | |||||
| pop_if_used 0, 0 | |||||
| jirl $r0, $r1, 0x0 | jirl $r0, $r1, 0x0 | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -118,7 +118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| //.L_M0 | //.L_M0 | ||||
| PROLOGUE | PROLOGUE | ||||
| push_if_used 24, 8 | |||||
| push_if_used 7, 0 | |||||
| move S0, SRC | move S0, SRC | ||||
| move P0, DST | move P0, DST | ||||
| @@ -521,6 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PTR_ADDI S1, S1, 0x04 | PTR_ADDI S1, S1, 0x04 | ||||
| PTR_ADDI P5, P5, 0x04 | PTR_ADDI P5, P5, 0x04 | ||||
| .L_M0: | .L_M0: | ||||
| pop_if_used 24, 8 | |||||
| pop_if_used 7, 0 | |||||
| jirl $r0, $r1, 0x00 | jirl $r0, $r1, 0x00 | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| //.L_M0 | //.L_M0 | ||||
| PROLOGUE | PROLOGUE | ||||
| push_if_used 23, 8 | |||||
| push_if_used 6, 0 | |||||
| move S0, SRC | move S0, SRC | ||||
| move P0, DST | move P0, DST | ||||
| @@ -401,6 +401,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PTR_ADDI S1, S1, 0x04 | PTR_ADDI S1, S1, 0x04 | ||||
| PTR_ADDI P4, P4, 0x04 | PTR_ADDI P4, P4, 0x04 | ||||
| .L_M0: | .L_M0: | ||||
| pop_if_used 23, 8 | |||||
| pop_if_used 6, 0 | |||||
| jirl $r0, $r1, 0x00 | jirl $r0, $r1, 0x00 | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -418,7 +418,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PROLOGUE | PROLOGUE | ||||
| PTR_LD INC_Y, $sp, 0 | PTR_LD INC_Y, $sp, 0 | ||||
| push_if_used 17 + 7, 19 | |||||
| push_if_used 7, 0 | |||||
| PTR_ADDI K, $r0, 0x01 | PTR_ADDI K, $r0, 0x01 | ||||
| PTR_SUB I, INC_X, K | PTR_SUB I, INC_X, K | ||||
| PTR_SUB J, INC_Y, K | PTR_SUB J, INC_Y, K | ||||
| @@ -458,6 +458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | ||||
| SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 | SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 | ||||
| .L_END: | .L_END: | ||||
| pop_if_used 17 + 7, 19 | |||||
| pop_if_used 7, 0 | |||||
| jirl $r0, $r1, 0x0 | jirl $r0, $r1, 0x0 | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -369,7 +369,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PROLOGUE | PROLOGUE | ||||
| PTR_LD INC_Y, $sp, 0 | PTR_LD INC_Y, $sp, 0 | ||||
| push_if_used 17 + 8, 18 | |||||
| push_if_used 8, 0 | |||||
| PTR_ADDI K, $r0, 0x01 | PTR_ADDI K, $r0, 0x01 | ||||
| PTR_SUB I, INC_X, K | PTR_SUB I, INC_X, K | ||||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | ||||
| @@ -400,6 +400,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L_GAP_1: /* if (incx != 1) */ | .L_GAP_1: /* if (incx != 1) */ | ||||
| SGEMV_T_LASX GAP_1, X8_GAP, X4_GAP | SGEMV_T_LASX GAP_1, X8_GAP, X4_GAP | ||||
| .L_END: | .L_END: | ||||
| pop_if_used 17 + 8, 18 | |||||
| pop_if_used 8, 0 | |||||
| jirl $r0, $r1, 0x0 | jirl $r0, $r1, 0x0 | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -253,7 +253,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PROLOGUE | PROLOGUE | ||||
| PTR_LD INC_Y, $sp, 0 | PTR_LD INC_Y, $sp, 0 | ||||
| push_if_used 17 + 7, 31 | |||||
| push_if_used 7, 7 | |||||
| PTR_ADDI K, $r0, 0x01 | PTR_ADDI K, $r0, 0x01 | ||||
| PTR_SUB I, INC_X, K | PTR_SUB I, INC_X, K | ||||
| PTR_SUB J, INC_Y, K | PTR_SUB J, INC_Y, K | ||||
| @@ -291,6 +291,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | ||||
| ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1 | ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1 | ||||
| .L_END: | .L_END: | ||||
| pop_if_used 17 + 7, 31 | |||||
| pop_if_used 7, 7 | |||||
| jirl $r0, $r1, 0x0 | jirl $r0, $r1, 0x0 | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -298,7 +298,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PROLOGUE | PROLOGUE | ||||
| PTR_LD INC_Y, $sp, 0 | PTR_LD INC_Y, $sp, 0 | ||||
| push_if_used 17 + 7, 31 | |||||
| push_if_used 7, 7 | |||||
| PTR_ADDI K, $r0, 0x01 | PTR_ADDI K, $r0, 0x01 | ||||
| PTR_SUB I, INC_X, K | PTR_SUB I, INC_X, K | ||||
| PTR_SUB J, INC_Y, K | PTR_SUB J, INC_Y, K | ||||
| @@ -337,7 +337,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | ||||
| ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 | ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 | ||||
| .L_END: | .L_END: | ||||
| pop_if_used 17 + 7, 31 | |||||
| pop_if_used 7, 7 | |||||
| jirl $r0, $r1, 0x0 | jirl $r0, $r1, 0x0 | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PROLOGUE | PROLOGUE | ||||
| PTR_LD INC_Y, $sp, 0 | PTR_LD INC_Y, $sp, 0 | ||||
| push_if_used 17 + 8, 30 | |||||
| push_if_used 8, 6 | |||||
| PTR_ADDI K, $r0, 0x01 | PTR_ADDI K, $r0, 0x01 | ||||
| PTR_SUB I, INC_X, K | PTR_SUB I, INC_X, K | ||||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | ||||
| @@ -263,6 +263,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L_GAP_1: /* if (incx != 1) */ | .L_GAP_1: /* if (incx != 1) */ | ||||
| ZGEMV_T_LSX GAP_1, X2_GAP | ZGEMV_T_LSX GAP_1, X2_GAP | ||||
| .L_END: | .L_END: | ||||
| pop_if_used 17 + 8, 30 | |||||
| pop_if_used 8, 6 | |||||
| jirl $r0, $r1, 0x0 | jirl $r0, $r1, 0x0 | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PROLOGUE | PROLOGUE | ||||
| PTR_LD INC_Y, $sp, 0 | PTR_LD INC_Y, $sp, 0 | ||||
| push_if_used 17 + 8, 30 | |||||
| push_if_used 8, 6 | |||||
| PTR_ADDI K, $r0, 0x01 | PTR_ADDI K, $r0, 0x01 | ||||
| PTR_SUB I, INC_X, K | PTR_SUB I, INC_X, K | ||||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | ||||
| @@ -294,6 +294,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L_GAP_1: /* if (incx != 1) */ | .L_GAP_1: /* if (incx != 1) */ | ||||
| ZGEMV_T_LASX GAP_1, X4_GAP | ZGEMV_T_LASX GAP_1, X4_GAP | ||||
| .L_END: | .L_END: | ||||
| pop_if_used 17 + 8, 30 | |||||
| pop_if_used 8, 6 | |||||
| jirl $r0, $r1, 0x0 | jirl $r0, $r1, 0x0 | ||||
| EPILOGUE | EPILOGUE | ||||