| @@ -955,12 +955,18 @@ endif | |||
| # LoongArch64: probe the compiler for the ABI / -march spellings it accepts and | |||
| # add the matching options to the common C and Fortran flags. | |||
| ifeq ($(ARCH), loongarch64) | |||
| # Use simple (:=) assignment so each compile probe runs exactly once, here at | |||
| # parse time. With recursive (=) assignment the $(shell ...) text would be | |||
| # carried by reference into CCOMMON_OPT/FCOMMON_OPT and re-executed on every | |||
| # expansion of those flags. | |||
| # Each probe yields its keyword when the test compile succeeds, otherwise empty. | |||
| LA64_ABI := $(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d) | |||
| LA64_ARCH := $(shell $(CC) -march=loongarch64 -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo loongarch64) | |||
| # Fall back to the lp64 ABI name when the compiler rejects -mabi=lp64d. | |||
| ifneq ($(LA64_ABI), lp64d) | |||
| LA64_ABI := lp64 | |||
| endif | |||
| # Pass -march=loongarch64 only when the compiler accepted it in the probe above. | |||
| ifneq ($(LA64_ARCH), loongarch64) | |||
| CCOMMON_OPT += -mabi=$(LA64_ABI) | |||
| FCOMMON_OPT += -mabi=$(LA64_ABI) | |||
| else | |||
| CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) | |||
| FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -197,10 +197,22 @@ fi | |||
| no_lsx=0 | |||
| no_lasx=0 | |||
| if [ "$architecture" = "loongarch64" ]; then | |||
| lasx_flags='-march=loongarch64' | |||
| lsx_flags='-march=loongarch64' | |||
| tmpd="$(mktemp -d)" | |||
| tmparch="$tmpd/arch.c" | |||
| printf "void main(void){ }\n" >> "$tmparch" | |||
| args="-march=loongarch64 -o $tmparch.o $tmparch" | |||
| { | |||
| $compiler_name $flags $args >/dev/null 2>&1 | |||
| } || { | |||
| lasx_flags='' | |||
| lsx_flags='' | |||
| } | |||
| tmplsx="$tmpd/lsx.c" | |||
| codelsx='"vadd.b $vr0, $vr0, $vr0"' | |||
| lsx_flags='-march=loongarch64' | |||
| printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx" | |||
| args="$lsx_flags -o $tmplsx.o $tmplsx" | |||
| { | |||
| @@ -211,7 +223,6 @@ if [ "$architecture" = "loongarch64" ]; then | |||
| tmplasx="$tmpd/lasx.c" | |||
| codelasx='"xvadd.b $xr0, $xr0, $xr0"' | |||
| lasx_flags='-march=loongarch64' | |||
| printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx" | |||
| args="$lasx_flags -o $tmplasx.o $tmplasx" | |||
| { | |||
| @@ -279,7 +279,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 7, 31 | |||
| push_if_used 7, 7 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| PTR_SUB J, INC_Y, K | |||
| @@ -318,6 +318,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | |||
| CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 | |||
| .L_END: | |||
| pop_if_used 17 + 7, 31 | |||
| pop_if_used 7, 7 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -336,7 +336,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 7, 31 | |||
| push_if_used 7, 7 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| PTR_SUB J, INC_Y, K | |||
| @@ -378,6 +378,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | |||
| CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1 | |||
| .L_END: | |||
| pop_if_used 17 + 7, 31 | |||
| pop_if_used 7, 7 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -255,7 +255,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 8, 30 | |||
| push_if_used 8, 6 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
| @@ -285,6 +285,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_GAP_1: /* if (incx != 1) */ | |||
| CGEMV_T_LSX GAP_1, X4_GAP | |||
| .L_END: | |||
| pop_if_used 17 + 8, 30 | |||
| pop_if_used 8, 6 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -304,7 +304,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 8, 30 | |||
| push_if_used 8, 6 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
| @@ -337,6 +337,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_GAP_1: /* if (incx != 1) */ | |||
| CGEMV_T_LASX GAP_1, X8_GAP | |||
| .L_END: | |||
| pop_if_used 17 + 8, 30 | |||
| pop_if_used 8, 6 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define D7 $vr15 | |||
| PROLOGUE | |||
| push_if_used 26, 32 | |||
| push_if_used 0, 0 | |||
| move TD, DST | |||
| move TS, SRC | |||
| slli.d TL, LDA, 0x03 | |||
| @@ -278,6 +278,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d M, M, -1 | |||
| blt ZERO, M, .L_M1 | |||
| .L_N0: | |||
| pop_if_used 26, 32 | |||
| pop_if_used 0, 0 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -66,7 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define U7 $vr7 | |||
| PROLOGUE | |||
| push_if_used 18, 8 | |||
| push_if_used 1, 0 | |||
| move S0, SRC | |||
| move P0, DST | |||
| @@ -274,7 +274,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fst.d F0, P3, 0x00 | |||
| .L_M0: | |||
| pop_if_used 18, 8 | |||
| pop_if_used 1, 0 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -76,7 +76,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define U7 $vr7 | |||
| PROLOGUE | |||
| push_if_used 24, 8 | |||
| push_if_used 7, 0 | |||
| move S0, SRC | |||
| move P0, DST | |||
| @@ -592,6 +592,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d S1, S1, 0x08 | |||
| addi.d P4, P4, 0x08 | |||
| .L_M0: | |||
| pop_if_used 24, 8 | |||
| pop_if_used 7, 0 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -509,7 +509,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 7, 24 + 4 | |||
| push_if_used 7, 4 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| PTR_SUB J, INC_Y, K | |||
| @@ -549,6 +549,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | |||
| DGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 | |||
| .L_END: | |||
| pop_if_used 17 + 7, 24 + 4 | |||
| pop_if_used 7, 4 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -445,7 +445,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 8, 24 + 3 | |||
| push_if_used 8, 3 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
| @@ -476,6 +476,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_GAP_1: /* if (incx != 1) */ | |||
| DGEMV_T_LASX GAP_1, X8_GAP, X4_GAP | |||
| .L_END: | |||
| pop_if_used 17 + 8, 24 + 3 | |||
| pop_if_used 8, 3 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -1029,7 +1029,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| PROLOGUE | |||
| push_if_used 26, 32 | |||
| push_if_used 9, 8 | |||
| PTR_SLLI LDC, LDC, 3 | |||
| /* if (!(N >> 2)) goto L_N3 */ | |||
| PTR_SRAI J, N, 2 /* J = bn >> 2 */ | |||
| @@ -1361,6 +1361,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| blt ZERO, I, .L_N1_I1 | |||
| .L_N1_M0: | |||
| .L_N0: | |||
| pop_if_used 26, 32 | |||
| pop_if_used 9, 8 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -128,31 +128,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "dtrsm_kernel_macro.S" | |||
| .macro ldrepl_macro start, end, stride | |||
| .macro ldrepl_macro stride:req, index:req, more:vararg | |||
| // Load Ux (x = 0...15) | |||
| .if \start <= \end | |||
| GLDREPL xv, d, $xr\start, A0, \stride * 8 | |||
| ldrepl_macro %start + 1, \end, %stride + 1 | |||
| GLDREPL xv, d, $xr\index, A0, \index * 8 - \stride * 8 | |||
| .ifnb \more | |||
| ldrepl_macro \stride, \more | |||
| .endif | |||
| .endm | |||
| .macro nmsub_macro start0, end0, start1, reg | |||
| .macro nmsub_macro reg:req, start0:req, start1:req, more:vararg | |||
| // Gx -= reg * Ux | |||
| .if \start0 <= \end0 | |||
| xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 | |||
| nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg | |||
| .ifnb \more | |||
| nmsub_macro \reg, \more | |||
| .endif | |||
| .endm | |||
| .macro B_st_macro start, end, stride, N | |||
| .macro B_st_macro N:req, stride:req, start:req, more:vararg | |||
| // Store Gx(x = 16...31) | |||
| .if \start <= \end | |||
| .if \N == 4 | |||
| xvst $xr\start, B0, \stride * 0x20 | |||
| xvst $xr\start, B0, \start * 0x20 - \stride * 0x20 | |||
| .elseif \N == 2 | |||
| vst $vr\start, B0, \stride * 0x10 | |||
| vst $vr\start, B0, \start * 0x10 - \stride * 0x10 | |||
| .elseif \N == 1 | |||
| fst.d $f\start, B0, \stride * 0x08 | |||
| fst.d $f\start, B0, \start * 0x08 - \stride * 0x08 | |||
| .endif | |||
| B_st_macro %start + 1, \end, %stride + 1, \N | |||
| .ifnb \more | |||
| B_st_macro \N, \stride, \more | |||
| .endif | |||
| .endm | |||
| @@ -194,86 +194,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 255 | |||
| // Sequentially extract data from A in row order | |||
| // Load 0 | |||
| ldrepl_macro 0, 15, 0 | |||
| ldrepl_macro 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |||
| GMUL xvf, d, G0, G0, U0 | |||
| nmsub_macro 17, 31, 1, G0 | |||
| nmsub_macro G0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, \ | |||
| 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 | |||
| PTR_ADDI A0, A0, 17 * 8 | |||
| // Load 1 | |||
| ldrepl_macro 1, 15, 0 | |||
| ldrepl_macro 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |||
| GMUL xvf, d, G1, G1, U1 | |||
| nmsub_macro 18, 31, 2, G1 | |||
| nmsub_macro G1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, \ | |||
| 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 | |||
| PTR_ADDI A0, A0, 17 * 8 | |||
| // Load 2 | |||
| ldrepl_macro 2, 15, 0 | |||
| ldrepl_macro 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |||
| GMUL xvf, d, G2, G2, U2 | |||
| nmsub_macro 19, 31, 3, G2 | |||
| nmsub_macro G2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, \ | |||
| 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 | |||
| PTR_ADDI A0, A0, 17 * 8 | |||
| // Load 3 | |||
| ldrepl_macro 3, 15, 0 | |||
| ldrepl_macro 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |||
| GMUL xvf, d, G3, G3, U3 | |||
| nmsub_macro 20, 31, 4, G3 | |||
| nmsub_macro G3, 20, 4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, \ | |||
| 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 | |||
| PTR_ADDI A0, A0, 17 * 8 | |||
| // Load 4 | |||
| ldrepl_macro 4, 15, 0 | |||
| ldrepl_macro 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |||
| GMUL xvf, d, G4, G4, U4 | |||
| nmsub_macro 21, 31, 5, G4 | |||
| nmsub_macro G4, 21, 5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, \ | |||
| 28, 12, 29, 13, 30, 14, 31, 15 | |||
| PTR_ADDI A0, A0, 17 * 8 | |||
| // Load 5 | |||
| ldrepl_macro 5, 15, 0 | |||
| ldrepl_macro 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |||
| GMUL xvf, d, G5, G5, U5 | |||
| nmsub_macro 22, 31, 6, G5 | |||
| nmsub_macro G5, 22, 6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, \ | |||
| 29, 13, 30, 14, 31, 15 | |||
| PTR_ADDI A0, A0, 17 * 8 | |||
| // Load 6 | |||
| ldrepl_macro 6, 15, 0 | |||
| ldrepl_macro 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |||
| GMUL xvf, d, G6, G6, U6 | |||
| nmsub_macro 23, 31, 7, G6 | |||
| nmsub_macro G6, 23, 7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, \ | |||
| 30, 14, 31, 15 | |||
| PTR_ADDI A0, A0, 17 * 8 | |||
| // Load 7 | |||
| ldrepl_macro 7, 15, 0 | |||
| ldrepl_macro 7, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |||
| GMUL xvf, d, G7, G7, U7 | |||
| nmsub_macro 24, 31, 8, G7 | |||
| nmsub_macro G7, 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 | |||
| PTR_ADDI A0, A0, 17 * 8 | |||
| // Load 8 | |||
| ldrepl_macro 8, 15, 0 | |||
| ldrepl_macro 8, 8, 9, 10, 11, 12, 13, 14, 15 | |||
| GMUL xvf, d, G8, G8, U8 | |||
| nmsub_macro 25, 31, 9, G8 | |||
| nmsub_macro G8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 | |||
| PTR_ADDI A0, A0, 17 * 8 | |||
| // Load 9 | |||
| ldrepl_macro 9, 15, 0 | |||
| ldrepl_macro 9, 9, 10, 11, 12, 13, 14, 15 | |||
| GMUL xvf, d, G9, G9, U9 | |||
| nmsub_macro 26, 31, 10, G9 | |||
| nmsub_macro G9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 | |||
| PTR_ADDI A0, A0, 17 * 8 | |||
| // Load 10 | |||
| ldrepl_macro 10, 15, 0 | |||
| ldrepl_macro 10, 10, 11, 12, 13, 14, 15 | |||
| GMUL xvf, d, G10, G10, U10 | |||
| nmsub_macro 27, 31, 11, G10 | |||
| nmsub_macro G10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 | |||
| PTR_ADDI A0, A0, 17 * 8 | |||
| // Load 11 | |||
| ldrepl_macro 11, 15, 0 | |||
| ldrepl_macro 11, 11, 12, 13, 14, 15 | |||
| GMUL xvf, d, G11, G11, U11 | |||
| nmsub_macro 28, 31, 12, G11 | |||
| nmsub_macro G11, 28, 12, 29, 13, 30, 14, 31, 15 | |||
| PTR_ADDI A0, A0, 17 * 8 | |||
| // Load 12 | |||
| ldrepl_macro 12, 15, 0 | |||
| ldrepl_macro 12, 12, 13, 14, 15 | |||
| GMUL xvf, d, G12, G12, U12 | |||
| nmsub_macro 29, 31, 13, G12 | |||
| nmsub_macro G12, 29, 13, 30, 14, 31, 15 | |||
| PTR_ADDI A0, A0, 17 * 8 | |||
| // Load 13 | |||
| ldrepl_macro 13, 15, 0 | |||
| ldrepl_macro 13, 13, 14, 15 | |||
| GMUL xvf, d, G13, G13, U13 | |||
| nmsub_macro 30, 31, 14, G13 | |||
| nmsub_macro G13, 30, 14, 31, 15 | |||
| PTR_ADDI A0, A0, 17 * 8 | |||
| // Load 14 | |||
| ldrepl_macro 14, 15, 0 | |||
| ldrepl_macro 14, 14, 15 | |||
| GMUL xvf, d, G14, G14, U14 | |||
| nmsub_macro 31, 31, 15, G14 | |||
| nmsub_macro G14, 31, 15 | |||
| PTR_ADDI A0, A0, 17 * 8 | |||
| // Load 15 | |||
| ldrepl_macro 15, 15, 0 | |||
| ldrepl_macro 15, 15 | |||
| GMUL xvf, d, G15, G15, U15 | |||
| // Finally, we can store the result. | |||
| // B is stored sequentially; C is transposed first and then stored. | |||
| B_st_macro 16, 31, 0, \N | |||
| B_st_macro \N, 16, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 | |||
| GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 | |||
| GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 | |||
| GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1 | |||
| @@ -334,46 +341,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 63 | |||
| // Sequentially extract data from A in row order | |||
| // Load 0 | |||
| ldrepl_macro 0, 7, 0 | |||
| ldrepl_macro 0, 0, 1, 2, 3, 4, 5, 6, 7 | |||
| GMUL xvf, d, G0, G0, U0 | |||
| nmsub_macro 17, 23, 1, G0 | |||
| nmsub_macro G0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 | |||
| PTR_ADDI A0, A0, 9 * 8 | |||
| // Load 1 | |||
| ldrepl_macro 1, 7, 0 | |||
| ldrepl_macro 1, 1, 2, 3, 4, 5, 6, 7 | |||
| GMUL xvf, d, G1, G1, U1 | |||
| nmsub_macro 18, 23, 2, G1 | |||
| nmsub_macro G1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 | |||
| PTR_ADDI A0, A0, 9 * 8 | |||
| // Load 2 | |||
| ldrepl_macro 2, 7, 0 | |||
| ldrepl_macro 2, 2, 3, 4, 5, 6, 7 | |||
| GMUL xvf, d, G2, G2, U2 | |||
| nmsub_macro 19, 23, 3, G2 | |||
| nmsub_macro G2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 | |||
| PTR_ADDI A0, A0, 9 * 8 | |||
| // Load 3 | |||
| ldrepl_macro 3, 7, 0 | |||
| ldrepl_macro 3, 3, 4, 5, 6, 7 | |||
| GMUL xvf, d, G3, G3, U3 | |||
| nmsub_macro 20, 23, 4, G3 | |||
| nmsub_macro G3, 20, 4, 21, 5, 22, 6, 23, 7 | |||
| PTR_ADDI A0, A0, 9 * 8 | |||
| // Load 4 | |||
| ldrepl_macro 4, 7, 0 | |||
| ldrepl_macro 4, 4, 5, 6, 7 | |||
| GMUL xvf, d, G4, G4, U4 | |||
| nmsub_macro 21, 23, 5, G4 | |||
| nmsub_macro G4, 21, 5, 22, 6, 23, 7 | |||
| PTR_ADDI A0, A0, 9 * 8 | |||
| // Load 5 | |||
| ldrepl_macro 5, 7, 0 | |||
| ldrepl_macro 5, 5, 6, 7 | |||
| GMUL xvf, d, G5, G5, U5 | |||
| nmsub_macro 22, 23, 6, G5 | |||
| nmsub_macro G5, 22, 6, 23, 7 | |||
| PTR_ADDI A0, A0, 9 * 8 | |||
| // Load 6 | |||
| ldrepl_macro 6, 7, 0 | |||
| ldrepl_macro 6, 6, 7 | |||
| GMUL xvf, d, G6, G6, U6 | |||
| nmsub_macro 23, 23, 7, G6 | |||
| nmsub_macro G6, 23, 7 | |||
| PTR_ADDI A0, A0, 9 * 8 | |||
| // Load 7 | |||
| ldrepl_macro 7, 7, 0 | |||
| ldrepl_macro 7, 7 | |||
| GMUL xvf, d, G7, G7, U7 | |||
| // Finally, we can store the result. | |||
| // B is stored sequentially; C is transposed first and then stored. | |||
| B_st_macro 16, 23, 0, \N | |||
| B_st_macro \N, 16, 16, 17, 18, 19, 20, 21, 22, 23 | |||
| GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 | |||
| GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 | |||
| .if \N == 4 | |||
| @@ -437,26 +444,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 15 | |||
| // Sequentially extract data from A in row order | |||
| // Load 0 | |||
| ldrepl_macro 0, 3, 0 | |||
| ldrepl_macro 0, 0, 1, 2, 3 | |||
| GMUL xvf, d, G0, G0, U0 | |||
| nmsub_macro 17, 19, 1, G0 | |||
| nmsub_macro G0, 17, 1, 18, 2, 19, 3 | |||
| PTR_ADDI A0, A0, 5 * 8 | |||
| // Load 1 | |||
| ldrepl_macro 1, 3, 0 | |||
| ldrepl_macro 1, 1, 2, 3 | |||
| GMUL xvf, d, G1, G1, U1 | |||
| nmsub_macro 18, 19, 2, G1 | |||
| nmsub_macro G1, 18, 2, 19, 3 | |||
| PTR_ADDI A0, A0, 5 * 8 | |||
| // Load 2 | |||
| ldrepl_macro 2, 3, 0 | |||
| ldrepl_macro 2, 2, 3 | |||
| GMUL xvf, d, G2, G2, U2 | |||
| nmsub_macro 19, 19, 3, G2 | |||
| nmsub_macro G2, 19, 3 | |||
| PTR_ADDI A0, A0, 5 * 8 | |||
| // Load 3 | |||
| ldrepl_macro 3, 3, 0 | |||
| ldrepl_macro 3, 3 | |||
| GMUL xvf, d, G3, G3, U3 | |||
| // Finally, we can store the result. | |||
| // B is stored sequentially; C is transposed first and then stored. | |||
| B_st_macro 16, 19, 0, \N | |||
| B_st_macro \N, 16, 16, 17, 18, 19 | |||
| GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 | |||
| .if \N == 4 | |||
| GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00 | |||
| @@ -501,16 +508,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 3 | |||
| // Sequentially extract data from A in row order | |||
| // Load 0 | |||
| ldrepl_macro 0, 1, 0 | |||
| ldrepl_macro 0, 0, 1 | |||
| GMUL xvf, d, G0, G0, U0 | |||
| nmsub_macro 17, 17, 1, G0 | |||
| nmsub_macro G0, 17, 1 | |||
| PTR_ADDI A0, A0, 3 * 8 | |||
| // Load 1 | |||
| ldrepl_macro 1, 1, 0 | |||
| ldrepl_macro 1, 1 | |||
| GMUL xvf, d, G1, G1, U1 | |||
| // Finally, we can store the result. | |||
| // B is stored sequentially; C is transposed first and then stored. | |||
| B_st_macro 16, 17, 0, \N | |||
| B_st_macro \N, 16, 16, 17 | |||
| GSBUTTERFLY xv, d, U0, U1, G1, G0 | |||
| .if \N == 4 | |||
| vst $vr0, C0, 0x00 | |||
| @@ -717,7 +724,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| PROLOGUE | |||
| push_if_used 26, 32 | |||
| push_if_used 9, 8 | |||
| PTR_SLLI LDC, LDC, 3 | |||
| /* if (!(N >> 2)) goto L_N3 */ | |||
| PTR_SRAI J, N, 2 /* J = bn >> 2 */ | |||
| @@ -954,6 +961,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PTR_ADD AA, AA, T0 // aa += 1 * k | |||
| .L_N1_M0: | |||
| .L_N0: | |||
| pop_if_used 26, 32 | |||
| pop_if_used 9, 8 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -128,33 +128,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "dtrsm_kernel_macro.S" | |||
| .macro ldrepl_macro start, end, stride | |||
| .macro ldrepl_macro stride:req, index:req, more:vararg | |||
| // Load each listed $xr register from B0; callers in this file pass 16...25, not 0...15 | |||
| .if \start <= \end | |||
| GLDREPL xv, d, $xr\start, B0, \stride * 8 | |||
| ldrepl_macro %start + 1, \end, %stride + 1 | |||
| GLDREPL xv, d, $xr\index, B0, \index * 8 - \stride * 8 | |||
| .ifnb \more | |||
| ldrepl_macro \stride, \more | |||
| .endif | |||
| .endm | |||
| .macro nmsub_macro start0, end0, start1, reg | |||
| // Ux -= reg * Dx | |||
| .if \start0 <= \end0 | |||
| .macro nmsub_macro reg:req, start0:req, start1:req, more:vararg | |||
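| // $xr<start0> -= reg * $xr<start1> for each (start0, start1) pair; callers in this file pass indices 0...15 for both | |||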
| xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 | |||
| nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg | |||
| .ifnb \more | |||
| nmsub_macro \reg, \more | |||
| .endif | |||
| .endm | |||
| .macro A_st_macro start, end, stride, N | |||
| // Store Ux(x = 0...15) | |||
| .if \start <= \end | |||
| .macro A_st_macro N:req, stride:req, start:req, more:vararg | |||
| // Store the listed $xr/$vr/$f registers to A0; callers in this file pass 0...15, not 16...31 | |||
| .if \N == 4 | |||
| xvst $xr\start, A0, \stride * 0x20 | |||
| xvst $xr\start, A0, \start * 0x20 - \stride * 0x20 | |||
| .elseif \N == 2 | |||
| vst $vr\start, A0, \stride * 0x10 | |||
| vst $vr\start, A0, \start * 0x10 - \stride * 0x10 | |||
| .elseif \N == 1 | |||
| fst.d $f\start, A0, \stride * 0x08 | |||
| fst.d $f\start, A0, \start * 0x08 - \stride * 0x08 | |||
| .endif | |||
| A_st_macro %start + 1, \end, %stride + 1, \N | |||
| .ifnb \more | |||
| A_st_macro \N, \stride, \more | |||
| .endif | |||
| .endm | |||
| @@ -167,22 +165,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 10 11 | |||
| // 15 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 16, 19, 0 | |||
| ldrepl_macro 16, 16, 17, 18, 19 | |||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 | |||
| ldrepl_macro 20, 22, 5 | |||
| nmsub_macro 4, 7, 0, D1 | |||
| ldrepl_macro 23, 24, 10 | |||
| ldrepl_macro 15, 20, 21, 22 | |||
| nmsub_macro D1, 4, 0, 5, 1, 6, 2, 7, 3 | |||
| ldrepl_macro 13, 23, 24 | |||
| GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7 | |||
| ldrepl_macro 25, 25, 15 | |||
| nmsub_macro 8, 11, 0, D2 | |||
| nmsub_macro 8, 11, 4, D5 | |||
| ldrepl_macro 10, 25 | |||
| nmsub_macro D2, 8, 0, 9, 1, 10, 2, 11, 3 | |||
| nmsub_macro D5, 8, 4, 9, 5, 10, 6, 11, 7 | |||
| GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11 | |||
| nmsub_macro 12, 15, 0, D3 | |||
| nmsub_macro 12, 15, 4, D6 | |||
| nmsub_macro 12, 15, 8, D8 | |||
| nmsub_macro D3, 12, 0, 13, 1, 14, 2, 15, 3 | |||
| nmsub_macro D6, 12, 4, 13, 5, 14, 6, 15, 7 | |||
| nmsub_macro D8, 12, 8, 13, 9, 14, 10, 15, 11 | |||
| GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 | |||
| // Store A | |||
| A_st_macro 0, 15, 0, 4 | |||
| A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ | |||
| U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ | |||
| @@ -197,13 +196,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //0 1 | |||
| // 3 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 16, 17, 0 | |||
| ldrepl_macro 16, 16, 17 | |||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 | |||
| ldrepl_macro 18, 18, 3 | |||
| nmsub_macro 4, 7, 0, D1 | |||
| ldrepl_macro 15, 18 | |||
| nmsub_macro D1, 4, 0, 5, 1, 6, 2, 7, 3 | |||
| GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 | |||
| // Store A | |||
| A_st_macro 0, 7, 0, 4 | |||
| A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ | |||
| U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 | |||
| @@ -218,22 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 10 11 | |||
| // 15 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 16, 19, 0 | |||
| ldrepl_macro 16, 16, 17, 18, 19 | |||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1 | |||
| ldrepl_macro 20, 22, 5 | |||
| nmsub_macro 2, 3, 0, D1 | |||
| ldrepl_macro 23, 24, 10 | |||
| ldrepl_macro 15, 20, 21, 22 | |||
| nmsub_macro D1, 2, 0, 3, 1 | |||
| ldrepl_macro 13, 23, 24 | |||
| GMUL xvf, d, U2, D4, U2, U3, D4, U3 | |||
| ldrepl_macro 25, 25, 15 | |||
| nmsub_macro 4, 5, 0, D2 | |||
| nmsub_macro 4, 5, 2, D5 | |||
| ldrepl_macro 10, 25 | |||
| nmsub_macro D2, 4, 0, 5, 1 | |||
| nmsub_macro D5, 4, 2, 5, 3 | |||
| GMUL xvf, d, U4, D7, U4, U5, D7, U5 | |||
| nmsub_macro 6, 7, 0, D3 | |||
| nmsub_macro 6, 7, 2, D6 | |||
| nmsub_macro 6, 7, 4, D8 | |||
| nmsub_macro D3, 6, 0, 7, 1 | |||
| nmsub_macro D6, 6, 2, 7, 3 | |||
| nmsub_macro D8, 6, 4, 7, 5 | |||
| GMUL xvf, d, U6, D9, U6, U7, D9, U7 | |||
| // Store A | |||
| A_st_macro 0, 7, 0, 4 | |||
| A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ | |||
| U2, C1, 0x00, U3, C1, 0x20, \ | |||
| @@ -248,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //0 1 | |||
| // 3 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 16, 17, 0 | |||
| ldrepl_macro 16, 16, 17 | |||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1 | |||
| ldrepl_macro 18, 18, 3 | |||
| nmsub_macro 2, 3, 0, D1 | |||
| ldrepl_macro 15, 18 | |||
| nmsub_macro D1, 2, 0, 3, 1 | |||
| GMUL xvf, d, U2, D2, U2, U3, D2, U3 | |||
| // Store A | |||
| A_st_macro 0, 3, 0, 4 | |||
| A_st_macro 4, 0, 0, 1, 2, 3 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ | |||
| U2, C1, 0x00, U3, C1, 0x20 | |||
| @@ -269,22 +268,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 10 11 | |||
| // 15 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 16, 19, 0 | |||
| ldrepl_macro 16, 16, 17, 18, 19 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| ldrepl_macro 20, 22, 5 | |||
| nmsub_macro 1, 1, 0, D1 | |||
| ldrepl_macro 23, 24, 10 | |||
| ldrepl_macro 15, 20, 21, 22 | |||
| nmsub_macro D1, 1, 0 | |||
| ldrepl_macro 13, 23, 24 | |||
| GMUL xvf, d, U1, D4, U1 | |||
| ldrepl_macro 25, 25, 15 | |||
| nmsub_macro 2, 2, 0, D2 | |||
| nmsub_macro 2, 2, 1, D5 | |||
| ldrepl_macro 10, 25 | |||
| nmsub_macro D2, 2, 0 | |||
| nmsub_macro D5, 2, 1 | |||
| GMUL xvf, d, U2, D7, U2 | |||
| nmsub_macro 3, 3, 0, D3 | |||
| nmsub_macro 3, 3, 1, D6 | |||
| nmsub_macro 3, 3, 2, D8 | |||
| nmsub_macro D3, 3, 0 | |||
| nmsub_macro D6, 3, 1 | |||
| nmsub_macro D8, 3, 2 | |||
| GMUL xvf, d, U3, D9, U3 | |||
| // Store A | |||
| A_st_macro 0, 3, 0, 4 | |||
| A_st_macro 4, 0, 0, 1, 2, 3 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 | |||
| .endm | |||
| @@ -296,13 +295,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //0 1 | |||
| // 3 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 16, 17, 0 | |||
| ldrepl_macro 16, 16, 17 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| ldrepl_macro 18, 18, 3 | |||
| nmsub_macro 1, 1, 0, D1 | |||
| ldrepl_macro 15, 18 | |||
| nmsub_macro D1, 1, 0 | |||
| GMUL xvf, d, U1, D2, U1 | |||
| // Store A | |||
| A_st_macro 0, 1, 0, 4 | |||
| A_st_macro 4, 0, 0, 1 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C1, 0x00 | |||
| .endm | |||
| @@ -316,23 +315,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 10 11 | |||
| // 15 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 16, 19, 0 | |||
| ldrepl_macro 16, 16, 17, 18, 19 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| ldrepl_macro 20, 22, 5 | |||
| nmsub_macro 1, 1, 0, D1 | |||
| ldrepl_macro 23, 24, 10 | |||
| ldrepl_macro 15, 20, 21, 22 | |||
| nmsub_macro D1, 1, 0 | |||
| ldrepl_macro 13, 23, 24 | |||
| GMUL xvf, d, U1, D4, U1 | |||
| ldrepl_macro 25, 25, 15 | |||
| nmsub_macro 2, 2, 0, D2 | |||
| nmsub_macro 2, 2, 1, D5 | |||
| ldrepl_macro 10, 25 | |||
| nmsub_macro D2, 2, 0 | |||
| nmsub_macro D5, 2, 1 | |||
| GMUL xvf, d, U2, D7, U2 | |||
| nmsub_macro 3, 3, 0, D3 | |||
| nmsub_macro 3, 3, 1, D6 | |||
| nmsub_macro 3, 3, 2, D8 | |||
| nmsub_macro D3, 3, 0 | |||
| nmsub_macro D6, 3, 1 | |||
| nmsub_macro D8, 3, 2 | |||
| GMUL xvf, d, U3, D9, U3 | |||
| // Store A | |||
| A_st_macro 0, 3, 0, 2 | |||
| A_st_macro 2, 0, 0, 1, 2, 3 | |||
| // Store C | |||
| GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00, | |||
| .endm | |||
| @@ -344,13 +343,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //0 1 | |||
| // 3 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 16, 17, 0 | |||
| ldrepl_macro 16, 16, 17 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| ldrepl_macro 18, 18, 3 | |||
| nmsub_macro 1, 1, 0, D1 | |||
| ldrepl_macro 15, 18 | |||
| nmsub_macro D1, 1, 0 | |||
| GMUL xvf, d, U1, D2, U1 | |||
| // Store A | |||
| A_st_macro 0, 1, 0, 2 | |||
| A_st_macro 2, 0, 0, 1 | |||
| // Store C | |||
| GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 | |||
| .endm | |||
| @@ -364,23 +363,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 10 11 | |||
| // 15 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 16, 19, 0 | |||
| ldrepl_macro 16, 16, 17, 18, 19 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| ldrepl_macro 20, 22, 5 | |||
| nmsub_macro 1, 1, 0, D1 | |||
| ldrepl_macro 23, 24, 10 | |||
| ldrepl_macro 15, 20, 21, 22 | |||
| nmsub_macro D1, 1, 0 | |||
| ldrepl_macro 13, 23, 24 | |||
| GMUL xvf, d, U1, D4, U1 | |||
| ldrepl_macro 25, 25, 15 | |||
| nmsub_macro 2, 2, 0, D2 | |||
| nmsub_macro 2, 2, 1, D5 | |||
| ldrepl_macro 10, 25 | |||
| nmsub_macro D2, 2, 0 | |||
| nmsub_macro D5, 2, 1 | |||
| GMUL xvf, d, U2, D7, U2 | |||
| nmsub_macro 3, 3, 0, D3 | |||
| nmsub_macro 3, 3, 1, D6 | |||
| nmsub_macro 3, 3, 2, D8 | |||
| nmsub_macro D3, 3, 0 | |||
| nmsub_macro D6, 3, 1 | |||
| nmsub_macro D8, 3, 2 | |||
| GMUL xvf, d, U3, D9, U3 | |||
| // Store A | |||
| A_st_macro 0, 3, 0, 1 | |||
| A_st_macro 1, 0, 0, 1, 2, 3 | |||
| // Store C | |||
| GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, | |||
| .endm | |||
| @@ -392,13 +391,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //0 1 | |||
| // 3 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 16, 17, 0 | |||
| ldrepl_macro 16, 16, 17 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| ldrepl_macro 18, 18, 3 | |||
| nmsub_macro 1, 1, 0, D1 | |||
| ldrepl_macro 15, 18 | |||
| nmsub_macro D1, 1, 0 | |||
| GMUL xvf, d, U1, D2, U1 | |||
| // Store A | |||
| A_st_macro 0, 1, 0, 1 | |||
| A_st_macro 1, 0, 0, 1 | |||
| // Store C | |||
| GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 | |||
| .endm | |||
| @@ -582,10 +581,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvld U2, C0, 0x40 | |||
| xvld U3, C0, 0x60 | |||
| .L_dsolve_16x1: | |||
| ldrepl_macro 16, 16, 0 | |||
| ldrepl_macro 16, 16 | |||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 | |||
| // Store A | |||
| A_st_macro 0, 3, 0, 4 | |||
| A_st_macro 4, 0, 0, 1, 2, 3 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 | |||
| .endm | |||
| @@ -599,10 +598,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvld U0, C0, 0x00 | |||
| xvld U1, C0, 0x20 | |||
| .L_dsolve_8x1: | |||
| ldrepl_macro 16, 16, 0 | |||
| ldrepl_macro 16, 16 | |||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1 | |||
| // Store A | |||
| A_st_macro 0, 1, 0, 4 | |||
| A_st_macro 4, 0, 0, 1 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20 | |||
| .endm | |||
| @@ -615,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* Load C0 */ | |||
| xvld U0, C0, 0x00 | |||
| .L_dsolve_4x1: | |||
| ldrepl_macro 16, 16, 0 | |||
| ldrepl_macro 16, 16 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| // Store A | |||
| A_st_macro 0, 0, 0, 4 | |||
| A_st_macro 4, 0, 0 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00 | |||
| .endm | |||
| @@ -631,10 +630,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* Load C0 */ | |||
| xvld U0, C0, 0x00 | |||
| .L_dsolve_2x1: | |||
| ldrepl_macro 16, 16, 0 | |||
| ldrepl_macro 16, 16 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| // Store A | |||
| A_st_macro 0, 0, 0, 2 | |||
| A_st_macro 2, 0, 0 | |||
| // Store C | |||
| GST v, , $vr0, C0, 0x00 | |||
| .endm | |||
| @@ -647,16 +646,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // Load C | |||
| fld.d $f0, C0, 0x00 | |||
| .L_dsolve_1x1: | |||
| ldrepl_macro 16, 16, 0 | |||
| ldrepl_macro 16, 16 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| // Store A | |||
| A_st_macro 0, 0, 0, 1 | |||
| A_st_macro 1, 0, 0 | |||
| // Store C | |||
| GST f, d, $f0, C0, 0x00 | |||
| .endm | |||
| PROLOGUE | |||
| push_if_used 26, 32 | |||
| push_if_used 9, 8 | |||
| PTR_SLLI LDC, LDC, 3 | |||
| PTR_SUB KK, ZERO, OFFSET | |||
| /* if (!(N >> 2)) goto L_N3 */ | |||
| @@ -877,6 +876,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PTR_ADD AA, AA, T0 // aa += 1 * k | |||
| .L_N1_M0: | |||
| .L_N0: | |||
| pop_if_used 26, 32 | |||
| pop_if_used 9, 8 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -111,33 +111,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "dtrsm_kernel_macro.S" | |||
| .macro ldrepl_macro start, end, stride | |||
| .macro ldrepl_macro stride:req, index:req, more:vararg | |||
| // Load Ux (x = 0...15) | |||
| .if \start <= \end | |||
| GLDREPL xv, d, $xr\start, B0, \stride * 8 | |||
| ldrepl_macro %start + 1, \end, %stride + 1 | |||
| GLDREPL xv, d, $xr\index, B0, \index * 8 - \stride * 8 | |||
| .ifnb \more | |||
| ldrepl_macro \stride, \more | |||
| .endif | |||
| .endm | |||
| .macro nmsub_macro start0, end0, start1, reg | |||
| // Ux -= reg * Dx | |||
| .if \start0 <= \end0 | |||
| .macro nmsub_macro reg:req, start0:req, start1:req, more:vararg | |||
| // Gx -= reg * Ux | |||
| xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 | |||
| nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg | |||
| .ifnb \more | |||
| nmsub_macro \reg, \more | |||
| .endif | |||
| .endm | |||
| .macro A_st_macro start, end, stride, N | |||
| // Store Ux(x = 0...15) | |||
| .if \start <= \end | |||
| .macro A_st_macro N:req, stride:req, start:req, more:vararg | |||
| // Store Gx(x = 16...31) | |||
| .if \N == 4 | |||
| xvst $xr\start, A0, \stride * 0x20 | |||
| xvst $xr\start, A0, \start * 0x20 - \stride * 0x20 | |||
| .elseif \N == 2 | |||
| vst $vr\start, A0, \stride * 0x10 | |||
| vst $vr\start, A0, \start * 0x10 - \stride * 0x10 | |||
| .elseif \N == 1 | |||
| fst.d $f\start, A0, \stride * 0x08 | |||
| fst.d $f\start, A0, \start * 0x08 - \stride * 0x08 | |||
| .endif | |||
| A_st_macro %start + 1, \end, %stride + 1, \N | |||
| .ifnb \more | |||
| A_st_macro \N, \stride, \more | |||
| .endif | |||
| .endm | |||
| @@ -148,13 +146,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //0 | |||
| //2 3 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 16, 16, 0 | |||
| ldrepl_macro 17, 18, 2 | |||
| ldrepl_macro 16, 16 | |||
| ldrepl_macro 15, 17, 18 | |||
| GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 | |||
| nmsub_macro 0, 3, 4, D1 | |||
| nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7 | |||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 | |||
| // Store A | |||
| A_st_macro 0, 7, 0, 4 | |||
| A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ | |||
| U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 | |||
| @@ -167,13 +165,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //0 | |||
| //2 3 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 16, 16, 0 | |||
| ldrepl_macro 17, 18, 2 | |||
| ldrepl_macro 16, 16 | |||
| ldrepl_macro 15, 17, 18 | |||
| GMUL xvf, d, U2, D2, U2, U3, D2, U3 | |||
| nmsub_macro 0, 1, 2, D1 | |||
| nmsub_macro D1, 0, 2, 1, 3 | |||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1 | |||
| // Store A | |||
| A_st_macro 0, 3, 0, 4 | |||
| A_st_macro 4, 0, 0, 1, 2, 3 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ | |||
| U2, C1, 0x00, U3, C1, 0x20 | |||
| @@ -186,13 +184,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //0 | |||
| //2 3 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 16, 16, 0 | |||
| ldrepl_macro 17, 18, 2 | |||
| ldrepl_macro 16, 16 | |||
| ldrepl_macro 15, 17, 18 | |||
| GMUL xvf, d, U1, D2, U1 | |||
| nmsub_macro 0, 0, 1, D1 | |||
| nmsub_macro D1, 0, 1 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| // Store A | |||
| A_st_macro 0, 1, 0, 4 | |||
| A_st_macro 4, 0, 0, 1 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C1, 0x00 | |||
| .endm | |||
| @@ -204,13 +202,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //0 | |||
| //2 3 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 16, 16, 0 | |||
| ldrepl_macro 17, 18, 2 | |||
| ldrepl_macro 16, 16 | |||
| ldrepl_macro 15, 17, 18 | |||
| GMUL xvf, d, U1, D2, U1 | |||
| nmsub_macro 0, 0, 1, D1 | |||
| nmsub_macro D1, 0, 1 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| // Store A | |||
| A_st_macro 0, 1, 0, 2 | |||
| A_st_macro 2, 0, 0, 1 | |||
| // Store C | |||
| GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 | |||
| .endm | |||
| @@ -222,13 +220,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //0 | |||
| //2 3 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 16, 16, 0 | |||
| ldrepl_macro 17, 18, 2 | |||
| ldrepl_macro 16, 16 | |||
| ldrepl_macro 15, 17, 18 | |||
| GMUL xvf, d, U1, D2, U1 | |||
| nmsub_macro 0, 0, 1, D1 | |||
| nmsub_macro D1, 0, 1 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| // Store A | |||
| A_st_macro 0, 1, 0, 1 | |||
| A_st_macro 1, 0, 0, 1 | |||
| // Store C | |||
| GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 | |||
| .endm | |||
| @@ -242,22 +240,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //8 9 10 | |||
| //12 13 14 15 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 22, 25, 12 | |||
| ldrepl_macro 10, 22, 23, 24, 25 | |||
| GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 | |||
| ldrepl_macro 19, 21, 8 | |||
| nmsub_macro 8, 11, 12, D8 | |||
| ldrepl_macro 17, 18, 4 | |||
| ldrepl_macro 11, 19, 20, 21 | |||
| nmsub_macro D8, 8, 12, 9, 13, 10, 14, 11, 15 | |||
| ldrepl_macro 13, 17, 18 | |||
| GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11 | |||
| ldrepl_macro 16, 16, 0 | |||
| nmsub_macro 4, 7, 12, D7 | |||
| nmsub_macro 4, 7, 8, D4 | |||
| ldrepl_macro 16, 16 | |||
| nmsub_macro D7, 4, 12, 5, 13, 6, 14, 7, 15 | |||
| nmsub_macro D4, 4, 8, 5, 9, 6, 10, 7, 11 | |||
| GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 | |||
| nmsub_macro 0, 3, 12, D6 | |||
| nmsub_macro 0, 3, 8, D3 | |||
| nmsub_macro 0, 3, 4, D1 | |||
| nmsub_macro D6, 0, 12, 1, 13, 2, 14, 3, 15 | |||
| nmsub_macro D3, 0, 8, 1, 9, 2, 10, 3, 11 | |||
| nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7 | |||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 | |||
| // Store A | |||
| A_st_macro 0, 15, 0, 4 | |||
| A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ | |||
| U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ | |||
| @@ -274,22 +272,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //8 9 10 | |||
| //12 13 14 15 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 22, 25, 12 | |||
| ldrepl_macro 10, 22, 23, 24, 25 | |||
| GMUL xvf, d, U6, D9, U6, U7, D9, U7 | |||
| ldrepl_macro 19, 21, 8 | |||
| nmsub_macro 4, 5, 6, D8 | |||
| ldrepl_macro 17, 18, 4 | |||
| ldrepl_macro 11, 19, 20, 21 | |||
| nmsub_macro D8, 4, 6, 5, 7 | |||
| ldrepl_macro 13, 17, 18 | |||
| GMUL xvf, d, U4, D5, U4, U5, D5, U5 | |||
| ldrepl_macro 16, 16, 0 | |||
| nmsub_macro 2, 3, 6, D7 | |||
| nmsub_macro 2, 3, 4, D4 | |||
| ldrepl_macro 16, 16 | |||
| nmsub_macro D7, 2, 6, 3, 7 | |||
| nmsub_macro D4, 2, 4, 3, 5 | |||
| GMUL xvf, d, U2, D2, U2, U3, D2, U3 | |||
| nmsub_macro 0, 1, 6, D6 | |||
| nmsub_macro 0, 1, 4, D3 | |||
| nmsub_macro 0, 1, 2, D1 | |||
| nmsub_macro D6, 0, 6, 1, 7 | |||
| nmsub_macro D3, 0, 4, 1, 5 | |||
| nmsub_macro D1, 0, 2, 1, 3 | |||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1 | |||
| // Store A | |||
| A_st_macro 0, 7, 0, 4 | |||
| A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ | |||
| U2, C1, 0x00, U3, C1, 0x20, \ | |||
| @@ -306,22 +304,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //8 9 10 | |||
| //12 13 14 15 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 22, 25, 12 | |||
| ldrepl_macro 10, 22, 23, 24, 25 | |||
| GMUL xvf, d, U3, D9, U3 | |||
| ldrepl_macro 19, 21, 8 | |||
| nmsub_macro 2, 2, 3, D8 | |||
| ldrepl_macro 17, 18, 4 | |||
| ldrepl_macro 11, 19, 20, 21 | |||
| nmsub_macro D8, 2, 3 | |||
| ldrepl_macro 13, 17, 18 | |||
| GMUL xvf, d, U2, D5, U2 | |||
| ldrepl_macro 16, 16, 0 | |||
| nmsub_macro 1, 1, 3, D7 | |||
| nmsub_macro 1, 1, 2, D4 | |||
| ldrepl_macro 16, 16 | |||
| nmsub_macro D7, 1, 3 | |||
| nmsub_macro D4, 1, 2 | |||
| GMUL xvf, d, U1, D2, U1 | |||
| nmsub_macro 0, 0, 3, D6 | |||
| nmsub_macro 0, 0, 2, D3 | |||
| nmsub_macro 0, 0, 1, D1 | |||
| nmsub_macro D6, 0, 3 | |||
| nmsub_macro D3, 0, 2 | |||
| nmsub_macro D1, 0, 1 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| // Store A | |||
| A_st_macro 0, 3, 0, 4 | |||
| A_st_macro 4, 0, 0, 1, 2, 3 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 | |||
| .endm | |||
| @@ -335,22 +333,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //8 9 10 | |||
| //12 13 14 15 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 22, 25, 12 | |||
| ldrepl_macro 10, 22, 23, 24, 25 | |||
| GMUL xvf, d, U3, D9, U3 | |||
| ldrepl_macro 19, 21, 8 | |||
| nmsub_macro 2, 2, 3, D8 | |||
| ldrepl_macro 17, 18, 4 | |||
| ldrepl_macro 11, 19, 20, 21 | |||
| nmsub_macro D8, 2, 3 | |||
| ldrepl_macro 13, 17, 18 | |||
| GMUL xvf, d, U2, D5, U2 | |||
| ldrepl_macro 16, 16, 0 | |||
| nmsub_macro 1, 1, 3, D7 | |||
| nmsub_macro 1, 1, 2, D4 | |||
| ldrepl_macro 16, 16 | |||
| nmsub_macro D7, 1, 3 | |||
| nmsub_macro D4, 1, 2 | |||
| GMUL xvf, d, U1, D2, U1 | |||
| nmsub_macro 0, 0, 3, D6 | |||
| nmsub_macro 0, 0, 2, D3 | |||
| nmsub_macro 0, 0, 1, D1 | |||
| nmsub_macro D6, 0, 3 | |||
| nmsub_macro D3, 0, 2 | |||
| nmsub_macro D1, 0, 1 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| // Store A | |||
| A_st_macro 0, 3, 0, 2 | |||
| A_st_macro 2, 0, 0, 1, 2, 3 | |||
| // Store C | |||
| GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00 | |||
| .endm | |||
| @@ -364,22 +362,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //8 9 10 | |||
| //12 13 14 15 | |||
| // Sequentially extract data from B in row order | |||
| ldrepl_macro 22, 25, 12 | |||
| ldrepl_macro 10, 22, 23, 24, 25 | |||
| GMUL xvf, d, U3, D9, U3 | |||
| ldrepl_macro 19, 21, 8 | |||
| nmsub_macro 2, 2, 3, D8 | |||
| ldrepl_macro 17, 18, 4 | |||
| ldrepl_macro 11, 19, 20, 21 | |||
| nmsub_macro D8, 2, 3 | |||
| ldrepl_macro 13, 17, 18 | |||
| GMUL xvf, d, U2, D5, U2 | |||
| ldrepl_macro 16, 16, 0 | |||
| nmsub_macro 1, 1, 3, D7 | |||
| nmsub_macro 1, 1, 2, D4 | |||
| ldrepl_macro 16, 16 | |||
| nmsub_macro D7, 1, 3 | |||
| nmsub_macro D4, 1, 2 | |||
| GMUL xvf, d, U1, D2, U1 | |||
| nmsub_macro 0, 0, 3, D6 | |||
| nmsub_macro 0, 0, 2, D3 | |||
| nmsub_macro 0, 0, 1, D1 | |||
| nmsub_macro D6, 0, 3 | |||
| nmsub_macro D3, 0, 2 | |||
| nmsub_macro D1, 0, 1 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| // Store A | |||
| A_st_macro 0, 3, 0, 1 | |||
| A_st_macro 1, 0, 0, 1, 2, 3 | |||
| // Store C | |||
| GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, | |||
| .endm | |||
| @@ -399,10 +397,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_dsolve_16x1: | |||
| PTR_ADDI A0, T1, -16 * 8 | |||
| PTR_ADDI B0, T2, -1 * 8 | |||
| ldrepl_macro 16, 16, 0 | |||
| ldrepl_macro 16, 16 | |||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 | |||
| // Store A | |||
| A_st_macro 0, 3, 0, 4 | |||
| A_st_macro 4, 0, 0, 1, 2, 3 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 | |||
| .endm | |||
| @@ -420,10 +418,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_dsolve_8x1: | |||
| PTR_ADDI A0, T1, -8 * 8 | |||
| PTR_ADDI B0, T2, -1 * 8 | |||
| ldrepl_macro 16, 16, 0 | |||
| ldrepl_macro 16, 16 | |||
| GMUL xvf, d, U0, D0, U0, U1, D0, U1 | |||
| // Store A | |||
| A_st_macro 0, 1, 0, 4 | |||
| A_st_macro 4, 0, 0, 1 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00, U1, C0, 0x20 | |||
| .endm | |||
| @@ -440,10 +438,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_dsolve_4x1: | |||
| PTR_ADDI A0, T1, -4 * 8 | |||
| PTR_ADDI B0, T2, -1 * 8 | |||
| ldrepl_macro 16, 16, 0 | |||
| ldrepl_macro 16, 16 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| // Store A | |||
| A_st_macro 0, 0, 0, 4 | |||
| A_st_macro 4, 0, 0 | |||
| // Store C | |||
| GST xv, , U0, C0, 0x00 | |||
| .endm | |||
| @@ -460,10 +458,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_dsolve_2x1: | |||
| PTR_ADDI A0, T1, -2 * 8 | |||
| PTR_ADDI B0, T2, -1 * 8 | |||
| ldrepl_macro 16, 16, 0 | |||
| ldrepl_macro 16, 16 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| // Store A | |||
| A_st_macro 0, 0, 0, 2 | |||
| A_st_macro 2, 0, 0 | |||
| // Store C | |||
| GST v, , $vr0, C0, 0x00 | |||
| .endm | |||
| @@ -480,10 +478,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_dsolve_1x1: | |||
| PTR_ADDI A0, T1, -1 * 8 | |||
| PTR_ADDI B0, T2, -1 * 8 | |||
| ldrepl_macro 16, 16, 0 | |||
| ldrepl_macro 16, 16 | |||
| GMUL xvf, d, U0, D0, U0 | |||
| // Store A | |||
| A_st_macro 0, 0, 0, 1 | |||
| A_st_macro 1, 0, 0 | |||
| // Store C | |||
| GST f, d, $f0, C0, 0x00 | |||
| .endm | |||
| @@ -697,7 +695,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| PROLOGUE | |||
| push_if_used 26, 32 | |||
| push_if_used 9, 8 | |||
| PTR_SLLI LDC, LDC, 3 | |||
| PTR_SUB KK, N, OFFSET | |||
| PTR_MUL T0, N, LDC | |||
| @@ -948,6 +946,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PTR_ADDI KK, KK, -4 | |||
| bnez J, .L_J1 | |||
| .L_N0: | |||
| pop_if_used 26, 32 | |||
| pop_if_used 9, 8 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -90,57 +90,175 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define PTR_FST fst.d | |||
| #endif | |||
| // The max registers available to the user which | |||
| // do not need to be preserved across calls. | |||
| // Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html | |||
| #define MAX_INT_CALLER_SAVED 17 | |||
| #define MAX_FP_CALLER_SAVED 24 | |||
| .altmacro // Enable alternate macro mode | |||
| /* | |||
| * Pushing and popping static registers into/from the stack. | |||
| * regs : number of static general-purpose registers, greater than or equal to 0, less than or equal to 9 | |||
| * fregs: number of static floating-point registers, greater than or equal to 0, less than or equal to 8 | |||
| */ | |||
| .macro push_if_used regs, fregs | |||
| .if \regs > MAX_INT_CALLER_SAVED | |||
| PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) | |||
| push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 | |||
| .if \regs > 0 | |||
| PTR_ADDI $sp, $sp, -(\regs << REG_LOG) | |||
| push_regs 0, \regs - 1 | |||
| .endif | |||
| .if \fregs > MAX_FP_CALLER_SAVED | |||
| PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) | |||
| push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 | |||
| .if \fregs > 0 | |||
| PTR_ADDI $sp, $sp, -(\fregs << FREG_LOG) | |||
| push_fregs 0, \fregs - 1 | |||
| .endif | |||
| .endm // End push_if_used | |||
| .macro pop_if_used regs, fregs | |||
| .if \fregs > MAX_FP_CALLER_SAVED | |||
| pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 | |||
| PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG | |||
| .if \fregs > 0 | |||
| pop_fregs 0, \fregs - 1 | |||
| PTR_ADDI $sp, $sp, \fregs << FREG_LOG | |||
| .endif | |||
| .if \regs > MAX_INT_CALLER_SAVED | |||
| pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 | |||
| PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG | |||
| .if \regs > 0 | |||
| pop_regs 0, \regs - 1 | |||
| PTR_ADDI $sp, $sp, \regs << REG_LOG | |||
| .endif | |||
| .endm // End pop_if_used | |||
| .macro push_regs from, to | |||
| PTR_ST $s\()\from, $sp, \from << REG_LOG | |||
| #ifdef __clang__ | |||
| .if \to >= 0 | |||
| PTR_ST $s0, $sp, 0 << REG_LOG | |||
| .endif | |||
| .if \to >= 1 | |||
| PTR_ST $s1, $sp, 1 << REG_LOG | |||
| .endif | |||
| .if \to >= 2 | |||
| PTR_ST $s2, $sp, 2 << REG_LOG | |||
| .endif | |||
| .if \to >= 3 | |||
| PTR_ST $s3, $sp, 3 << REG_LOG | |||
| .endif | |||
| .if \to >= 4 | |||
| PTR_ST $s4, $sp, 4 << REG_LOG | |||
| .endif | |||
| .if \to >= 5 | |||
| PTR_ST $s5, $sp, 5 << REG_LOG | |||
| .endif | |||
| .if \to >= 6 | |||
| PTR_ST $s6, $sp, 6 << REG_LOG | |||
| .endif | |||
| .if \to >= 7 | |||
| PTR_ST $s7, $sp, 7 << REG_LOG | |||
| .endif | |||
| .if \to >= 8 | |||
| PTR_ST $s8, $sp, 8 << REG_LOG | |||
| .endif | |||
| #else | |||
| PTR_ST $s\()\from, $sp, \from << REG_LOG | |||
| .if \to - \from | |||
| push_regs %from + 1, \to | |||
| .endif | |||
| #endif | |||
| .endm // End push_regs | |||
| // | |||
| // pop_regs: reload static general-purpose registers from the stack. | |||
| // Register $s<i> is loaded with PTR_LD from $sp + (i << REG_LOG). | |||
| // The macro itself never changes $sp. | |||
| // from: index of the first register to reload (only used by the non-clang branch) | |||
| // to  : index of the last register to reload (inclusive) | |||
| // With __clang__ defined the sequence is unrolled: one guarded load per | |||
| // register $s0..$s8, always starting at $s0, so "from" is ignored and a | |||
| // "to" above 8 loads nothing beyond $s8. | |||
| // Otherwise the macro recurses with "from + 1" (evaluated through the | |||
| // altmacro % operator) until from equals to; from must not exceed to or | |||
| // the recursion does not terminate. | |||
| // NOTE(review): the unrolled branch presumably exists because the clang | |||
| // integrated assembler cannot expand the recursive form - confirm. | |||
| // | |||
| .macro pop_regs from, to | |||
| #ifdef __clang__ | |||
| .if \to >= 0 | |||
| PTR_LD $s0, $sp, 0 << REG_LOG | |||
| .endif | |||
| .if \to >= 1 | |||
| PTR_LD $s1, $sp, 1 << REG_LOG | |||
| .endif | |||
| .if \to >= 2 | |||
| PTR_LD $s2, $sp, 2 << REG_LOG | |||
| .endif | |||
| .if \to >= 3 | |||
| PTR_LD $s3, $sp, 3 << REG_LOG | |||
| .endif | |||
| .if \to >= 4 | |||
| PTR_LD $s4, $sp, 4 << REG_LOG | |||
| .endif | |||
| .if \to >= 5 | |||
| PTR_LD $s5, $sp, 5 << REG_LOG | |||
| .endif | |||
| .if \to >= 6 | |||
| PTR_LD $s6, $sp, 6 << REG_LOG | |||
| .endif | |||
| .if \to >= 7 | |||
| PTR_LD $s7, $sp, 7 << REG_LOG | |||
| .endif | |||
| .if \to >= 8 | |||
| PTR_LD $s8, $sp, 8 << REG_LOG | |||
| .endif | |||
| #else | |||
| PTR_LD $s\()\from, $sp, \from << REG_LOG | |||
| // Non-zero difference means more registers remain: recurse on the next index. | |||
| .if \to - \from | |||
| pop_regs %from + 1, \to | |||
| .endif | |||
| #endif | |||
| .endm // End pop_regs | |||
| // | |||
| // push_fregs: save static floating-point registers to the stack. | |||
| // Register $fs<i> is stored with PTR_FST at $sp + (i << FREG_LOG). | |||
| // The macro itself never changes $sp; space must already be reserved. | |||
| // from: index of the first register to save (only used by the non-clang branch) | |||
| // to  : index of the last register to save (inclusive) | |||
| // With __clang__ defined the sequence is unrolled: one guarded store per | |||
| // register $fs0..$fs7, always starting at $fs0, so "from" is ignored and a | |||
| // "to" above 7 stores nothing beyond $fs7. | |||
| // Otherwise the macro recurses with "from + 1" (evaluated through the | |||
| // altmacro % operator) until from equals to; from must not exceed to or | |||
| // the recursion does not terminate. | |||
| // NOTE(review): the unrolled branch presumably exists because the clang | |||
| // integrated assembler cannot expand the recursive form - confirm. | |||
| // | |||
| .macro push_fregs from, to | |||
| #ifdef __clang__ | |||
| .if \to >= 0 | |||
| PTR_FST $fs0, $sp, 0 << FREG_LOG | |||
| .endif | |||
| .if \to >= 1 | |||
| PTR_FST $fs1, $sp, 1 << FREG_LOG | |||
| .endif | |||
| .if \to >= 2 | |||
| PTR_FST $fs2, $sp, 2 << FREG_LOG | |||
| .endif | |||
| .if \to >= 3 | |||
| PTR_FST $fs3, $sp, 3 << FREG_LOG | |||
| .endif | |||
| .if \to >= 4 | |||
| PTR_FST $fs4, $sp, 4 << FREG_LOG | |||
| .endif | |||
| .if \to >= 5 | |||
| PTR_FST $fs5, $sp, 5 << FREG_LOG | |||
| .endif | |||
| .if \to >= 6 | |||
| PTR_FST $fs6, $sp, 6 << FREG_LOG | |||
| .endif | |||
| .if \to >= 7 | |||
| PTR_FST $fs7, $sp, 7 << FREG_LOG | |||
| .endif | |||
| #else | |||
| PTR_FST $fs\()\from, $sp, \from << FREG_LOG | |||
| // Non-zero difference means more registers remain: recurse on the next index. | |||
| .if \to - \from | |||
| push_fregs %from + 1, \to | |||
| .endif | |||
| #endif | |||
| .endm // End push_fregs | |||
| // | |||
| // pop_fregs: reload static floating-point registers from the stack. | |||
| // Register $fs<i> is loaded with PTR_FLD from $sp + (i << FREG_LOG). | |||
| // The macro itself never changes $sp. | |||
| // from: index of the first register to reload (only used by the non-clang branch) | |||
| // to  : index of the last register to reload (inclusive) | |||
| // With __clang__ defined the sequence is unrolled: one guarded load per | |||
| // register $fs0..$fs7, always starting at $fs0, so "from" is ignored and a | |||
| // "to" above 7 loads nothing beyond $fs7. | |||
| // Otherwise the macro recurses with "from + 1" (evaluated through the | |||
| // altmacro % operator) until from equals to; from must not exceed to or | |||
| // the recursion does not terminate. | |||
| // NOTE(review): the unrolled branch presumably exists because the clang | |||
| // integrated assembler cannot expand the recursive form - confirm. | |||
| // | |||
| .macro pop_fregs from, to | |||
| #ifdef __clang__ | |||
| .if \to >= 0 | |||
| PTR_FLD $fs0, $sp, 0 << FREG_LOG | |||
| .endif | |||
| .if \to >= 1 | |||
| PTR_FLD $fs1, $sp, 1 << FREG_LOG | |||
| .endif | |||
| .if \to >= 2 | |||
| PTR_FLD $fs2, $sp, 2 << FREG_LOG | |||
| .endif | |||
| .if \to >= 3 | |||
| PTR_FLD $fs3, $sp, 3 << FREG_LOG | |||
| .endif | |||
| .if \to >= 4 | |||
| PTR_FLD $fs4, $sp, 4 << FREG_LOG | |||
| .endif | |||
| .if \to >= 5 | |||
| PTR_FLD $fs5, $sp, 5 << FREG_LOG | |||
| .endif | |||
| .if \to >= 6 | |||
| PTR_FLD $fs6, $sp, 6 << FREG_LOG | |||
| .endif | |||
| .if \to >= 7 | |||
| PTR_FLD $fs7, $sp, 7 << FREG_LOG | |||
| .endif | |||
| #else | |||
| PTR_FLD $fs\()\from, $sp, \from << FREG_LOG | |||
| // Non-zero difference means more registers remain: recurse on the next index. | |||
| .if \to - \from | |||
| pop_fregs %from + 1, \to | |||
| .endif | |||
| #endif | |||
| .endm // End pop_fregs | |||
| // | |||
| @@ -275,7 +393,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // GXOR | |||
| // | |||
| .macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()xor.\suf_op \out, \in0, \in1 | |||
| .ifnb \pre_op | |||
| \pre_op\()xor.v \out, \in0, \in1 | |||
| .else | |||
| xor.\suf_op \out, \in0, \in1 | |||
| .endif | |||
| .ifnb \more | |||
| GXOR \pre_op, \suf_op, \more | |||
| .endif | |||
| @@ -307,6 +429,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| GPRELD \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GPACKEV | |||
| // Emits one "<pre_op>packev.<suf_op> out, in0, in1" instruction per | |||
| // (out, in0, in1) triple in the argument list. | |||
| // pre_op: text placed directly in front of "packev" (e.g. v or xv) | |||
| // suf_op: text placed after the dot (element-width suffix) | |||
| // more  : optional further triples; when non-blank the macro re-invokes | |||
| //         itself with the same pre_op/suf_op and the remaining arguments | |||
| // | |||
| .macro GPACKEV pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()packev.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GPACKEV \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GPACKOD | |||
| // Emits one "<pre_op>packod.<suf_op> out, in0, in1" instruction per | |||
| // (out, in0, in1) triple in the argument list. | |||
| // pre_op: text placed directly in front of "packod" (e.g. v or xv) | |||
| // suf_op: text placed after the dot (element-width suffix) | |||
| // more  : optional further triples; when non-blank the macro re-invokes | |||
| //         itself with the same pre_op/suf_op and the remaining arguments | |||
| // | |||
| .macro GPACKOD pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()packod.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GPACKOD \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GSHUF4I | |||
| // Emits one "<pre_op>shuf4i.<suf_op> out, in0, in1" instruction per | |||
| // (out, in0, in1) triple in the argument list. Unlike the pack macros, | |||
| // the third operand (in1) is the immediate, as the inline note says. | |||
| // pre_op: text placed directly in front of "shuf4i" (e.g. v or xv) | |||
| // suf_op: text placed after the dot (element-width suffix) | |||
| // more  : optional further triples; when non-blank the macro re-invokes | |||
| //         itself with the same pre_op/suf_op and the remaining arguments | |||
| // | |||
| .macro GSHUF4I pre_op:req, suf_op:req, out:req, in0:req, in1:req /* imm */, more:vararg | |||
| \pre_op\()shuf4i.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GSHUF4I \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // TRANSF2G | |||
| // Invokes the macro called "name" after translating a floating-point | |||
| // style (pre_op, suf_op) pair into another prefix/suffix pair: | |||
| //   vf  + s -> v,  w | |||
| //   vf  + d -> v,  d | |||
| //   xvf + s -> xv, w | |||
| //   xvf + d -> xv, d | |||
| // The remaining arguments (more) are forwarded to "name" unchanged. | |||
| // Matching is done on the concatenated string pre_op + suf_op, so any | |||
| // other combination matches none of the cases and emits nothing. | |||
| // | |||
| .macro TRANSF2G name, pre_op:req, suf_op:req, more:vararg | |||
| .ifeqs "\pre_op\()\suf_op", "vfs" | |||
| \name v, w, \more | |||
| .endif | |||
| .ifeqs "\pre_op\()\suf_op", "vfd" | |||
| \name v, d, \more | |||
| .endif | |||
| .ifeqs "\pre_op\()\suf_op", "xvfs" | |||
| \name xv, w, \more | |||
| .endif | |||
| .ifeqs "\pre_op\()\suf_op", "xvfd" | |||
| \name xv, d, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // Compound instructions | |||
| @@ -314,61 +478,96 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // GACC: Accumulate the values of vector registers | |||
| // | |||
| .macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg | |||
| .ifeqs "\pre_op", "xvf" | |||
| .ifeqs "\pre_op\()\suf_op", "xvfd" | |||
| xvpermi.q \out, \in, 0x01 | |||
| \pre_op\()add.\suf_op \in, \out, \in | |||
| xvpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .ifeqs "\pre_op\()\suf_op", "xvfs" | |||
| xvpermi.q \out, \in, 0x01 | |||
| \pre_op\()add.\suf_op \in, \out, \in | |||
| xvpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifeqs "\suf_op", "s" | |||
| xvpackod.w \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .ifeqs "\pre_op\()\suf_op", "vfd" | |||
| vpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .ifeqs "\pre_op", "vf" | |||
| .ifeqs "\pre_op\()\suf_op", "vfs" | |||
| vpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifeqs "\suf_op", "s" | |||
| vpackod.w \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .endif | |||
| .ifeqs "\pre_op", "xv" | |||
| .ifeqs "\pre_op\()\suf_op", "xvd" | |||
| xvpermi.q \out, \in, 0x01 | |||
| \pre_op\()add.\suf_op \in, \out, \in | |||
| xvpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .ifeqs "\pre_op\()\suf_op", "xvw" | |||
| xvpermi.q \out, \in, 0x01 | |||
| \pre_op\()add.\suf_op \in, \out, \in | |||
| xvpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| xvpackod.w \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .ifeqs "\pre_op\()\suf_op", "xvh" | |||
| xvpermi.q \out, \in, 0x01 | |||
| \pre_op\()add.\suf_op \in, \out, \in | |||
| xvpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| xvpackod.w \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| xvpackod.h \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .ifeqs "\pre_op\()\suf_op", "xvb" | |||
| xvpermi.q \out, \in, 0x01 | |||
| \pre_op\()add.\suf_op \in, \out, \in | |||
| xvpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifnc "\suf_op", "d" | |||
| xvpackod.w \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifnc "\suf_op", "w" | |||
| xvpackod.h \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifnc "\suf_op", "h" | |||
| xvpackod.b \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .ifeqs "\pre_op\()\suf_op", "vd" | |||
| vpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .ifeqs "\pre_op\()\suf_op", "vw" | |||
| vpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| vpackod.w \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .ifeqs "\pre_op\()\suf_op", "vh" | |||
| vpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| vpackod.w \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| vpackod.h \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .ifeqs "\pre_op", "v" | |||
| .ifeqs "\pre_op\()\suf_op", "vb" | |||
| vpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifnc "\suf_op", "d" | |||
| vpackod.w \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifnc "\suf_op", "w" | |||
| vpackod.h \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifnc "\suf_op", "h" | |||
| vpackod.b \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .endif | |||
| .endif | |||
| .endif | |||
| .ifnb \more | |||
| GACC \pre_op, \suf_op, \more | |||
| @@ -391,27 +590,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // Note: When "pre_op = xvf && suf_op = s", in will be modified. | |||
| // | |||
| .macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg | |||
| .ifeqs "\pre_op", "xvf" | |||
| .ifeqs "\pre_op\()\suf_op", "xvfd" | |||
| xvpermi.q \out, \in, 0x01 | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .ifeqs "\pre_op\()\suf_op", "xvfs" | |||
| xvpermi.q \out, \in, 0x01 | |||
| .ifeqs "\suf_op", "s" | |||
| \pre_op\()add.\suf_op \in, \out, \in | |||
| xvpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .else | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .ifeqs "\pre_op\()\suf_op", "vfd" | |||
| vor.v \out, \in, \in | |||
| .endif | |||
| .ifeqs "\pre_op", "vf" | |||
| .ifeqs "\suf_op", "s" | |||
| .ifeqs "\pre_op\()\suf_op", "vfs" | |||
| vpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .else | |||
| vor.v \out, \in, \in | |||
| .endif | |||
| .endif | |||
| .ifnb \more | |||
| GCOMPLEXACC \pre_op, \suf_op, \more | |||
| .endif | |||
| @@ -430,56 +629,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // suf_op: s or d, differentiate between single precision or double precision complex numbers | |||
| // | |||
| .macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg | |||
| .ifeqs "\pre_op", "xvf" | |||
| xvxor.v \tmp1, \tmp1, \tmp1 | |||
| .ifeqs "\suf_op", "s" | |||
| xvpackev.w \tmp0, \in0, \in0 | |||
| .else | |||
| xvpackev.d \tmp0, \in0, \in0 | |||
| .endif | |||
| .else | |||
| vxor.v \tmp1, \tmp1, \tmp1 | |||
| .ifeqs "\suf_op", "s" | |||
| vpackev.w \tmp0, \in0, \in0 | |||
| .else | |||
| vpackev.d \tmp0, \in0, \in0 | |||
| .endif | |||
| .endif | |||
| TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1 | |||
| TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0 | |||
| \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 | |||
| .ifeqs "\pre_op", "xvf" | |||
| .ifeqs "\suf_op", "s" | |||
| .ifeqs "\xconj", "0" | |||
| xvpackod.w \tmp1, \in0, \tmp1 | |||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1 | |||
| .else | |||
| xvpackod.w \tmp1, \tmp1, \in0 | |||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0 | |||
| .endif | |||
| xvshuf4i.w \tmp2, \in1, 0xb1 | |||
| .else | |||
| .ifeqs "\xconj", "0" | |||
| xvpackod.d \tmp1, \in0, \tmp1 | |||
| .else | |||
| xvpackod.d \tmp1, \tmp1, \in0 | |||
| .endif | |||
| xvshuf4i.d \tmp2, \in1, 0x0b | |||
| .endif | |||
| .else | |||
| .ifeqs "\suf_op", "s" | |||
| .ifeqs "\xconj", "0" | |||
| vpackod.w \tmp1, \in0, \tmp1 | |||
| .else | |||
| vpackod.w \tmp1, \tmp1, \in0 | |||
| .endif | |||
| vshuf4i.w \tmp2, \in1, 0xb1 | |||
| .else | |||
| .ifeqs "\xconj", "0" | |||
| vpackod.d \tmp1, \in0, \tmp1 | |||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 | |||
| .else | |||
| vpackod.d \tmp1, \tmp1, \in0 | |||
| .endif | |||
| vshuf4i.d \tmp2, \in1, 0x0b | |||
| .endif | |||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b | |||
| .endif | |||
| \pre_op\()mul.\suf_op \out, \tmp0, \in1 | |||
| @@ -512,112 +676,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // suf_op: s or d, differentiate between single precision or double precision complex numbers | |||
| // | |||
| .macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg | |||
| .ifeqs "\pre_op", "xvf" | |||
| xvxor.v \tmp1, \tmp1, \tmp1 | |||
| .ifeqs "\suf_op", "s" | |||
| xvpackev.w \tmp0, \in0, \in0 | |||
| .else | |||
| xvpackev.d \tmp0, \in0, \in0 | |||
| .endif | |||
| .else | |||
| vxor.v \tmp1, \tmp1, \tmp1 | |||
| .ifeqs "\suf_op", "s" | |||
| vpackev.w \tmp0, \in0, \in0 | |||
| .else | |||
| vpackev.d \tmp0, \in0, \in0 | |||
| .endif | |||
| .endif | |||
| TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1 | |||
| TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0 | |||
| \pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2 | |||
| .ifeqs "\conj", "1" | |||
| .ifeqs "\conj\()\suf_op", "1s" | |||
| \pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2 | |||
| .ifeqs "\pre_op", "xvf" | |||
| .ifeqs "\suf_op", "s" | |||
| xvshuf4i.w \tmp0, \tmp0, 0xb1 | |||
| xvpackev.w \out, \tmp0, \tmp2 | |||
| .else | |||
| xvshuf4i.d \tmp0, \tmp0, 0x0b | |||
| xvpackev.d \out, \tmp0, \tmp2 | |||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0xb1 | |||
| TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2 | |||
| .endif | |||
| .else | |||
| .ifeqs "\suf_op", "s" | |||
| vshuf4i.w \tmp0, \tmp0, 0xb1 | |||
| vpackev.w \out, \tmp0, \tmp2 | |||
| .else | |||
| vshuf4i.d \tmp0, \tmp0, 0x0b | |||
| vpackev.d \out, \tmp0, \tmp2 | |||
| .ifeqs "\conj\()\suf_op", "1d" | |||
| \pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2 | |||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0x0b | |||
| TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2 | |||
| .endif | |||
| .endif /* pre_op = xvf */ | |||
| .else | |||
| .ifeqs "\conj", "0" | |||
| \pre_op\()add.\suf_op \out, \tmp2, \tmp1 | |||
| .endif /* conj = 1 */ | |||
| .endif | |||
| \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 | |||
| .ifeqs "\pre_op", "xvf" | |||
| .ifeqs "\suf_op", "s" | |||
| .ifeqs "\conj", "0" | |||
| .ifeqs "\xconj", "0" | |||
| xvpackod.w \tmp1, \in0, \tmp1 | |||
| .else | |||
| xvpackod.w \tmp1, \tmp1, \in0 | |||
| .ifeqs "\xconj\()\conj\()\suf_op", "00s" | |||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1 | |||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 | |||
| .endif | |||
| .else | |||
| .ifeqs "\xconj", "0" | |||
| xvpackod.w \tmp1, \in0, \in0 | |||
| .else | |||
| xvpackod.w \tmp1, \tmp1, \tmp1 | |||
| .ifeqs "\xconj\()\conj\()\suf_op", "10s" | |||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0 | |||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 | |||
| .endif | |||
| .ifeqs "\xconj\()\conj\()\suf_op", "01s" | |||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0 | |||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 | |||
| .endif | |||
| xvshuf4i.w \tmp2, \in1, 0xb1 | |||
| .else | |||
| .ifeqs "\conj", "0" | |||
| .ifeqs "\xconj", "0" | |||
| xvpackod.d \tmp1, \in0, \tmp1 | |||
| .else | |||
| xvpackod.d \tmp1, \tmp1, \in0 | |||
| .endif | |||
| .else | |||
| .ifeqs "\xconj", "0" | |||
| xvpackod.d \tmp1, \in0, \in0 | |||
| .else | |||
| xvpackod.d \tmp1, \tmp1, \tmp1 | |||
| .endif | |||
| .endif | |||
| xvshuf4i.d \tmp2, \in1, 0x0b | |||
| .endif | |||
| .else | |||
| .ifeqs "\suf_op", "s" | |||
| .ifeqs "\conj", "0" | |||
| .ifeqs "\xconj", "0" | |||
| vpackod.w \tmp1, \in0, \tmp1 | |||
| .else | |||
| vpackod.w \tmp1, \tmp1, \in0 | |||
| .ifeqs "\xconj\()\conj\()\suf_op", "11s" | |||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1 | |||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1 | |||
| .endif | |||
| .else | |||
| .ifeqs "\xconj", "0" | |||
| vpackod.w \tmp1, \in0, \in0 | |||
| .else | |||
| vpackod.w \tmp1, \tmp1, \tmp1 | |||
| .endif | |||
| .endif | |||
| vshuf4i.w \tmp2, \in1, 0xb1 | |||
| .else | |||
| .ifeqs "\conj", "0" | |||
| .ifeqs "\xconj", "0" | |||
| vpackod.d \tmp1, \in0, \tmp1 | |||
| .else | |||
| vpackod.d \tmp1, \tmp1, \in0 | |||
| .endif | |||
| .else | |||
| .ifeqs "\xconj", "0" | |||
| vpackod.d \tmp1, \in0, \in0 | |||
| .else | |||
| vpackod.d \tmp1, \tmp1, \tmp1 | |||
| .ifeqs "\xconj\()\conj\()\suf_op", "00d" | |||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1 | |||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b | |||
| .endif | |||
| .ifeqs "\xconj\()\conj\()\suf_op", "10d" | |||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0 | |||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b | |||
| .endif | |||
| vshuf4i.d \tmp2, \in1, 0x0b | |||
| .ifeqs "\xconj\()\conj\()\suf_op", "01d" | |||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0 | |||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b | |||
| .endif | |||
| .ifeqs "\xconj\()\conj\()\suf_op", "11d" | |||
| TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1 | |||
| TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b | |||
| .endif | |||
| \pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out | |||
| @@ -837,7 +837,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| PROLOGUE | |||
| push_if_used 26, 32 | |||
| push_if_used 9, 8 | |||
| xvreplve0.w VALPHA, $xr0 | |||
| #if defined (TRMMKERNEL) && !defined(LEFT) | |||
| PTR_SUB OFF, ZERO, OFFSET | |||
| @@ -2343,6 +2343,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif // #if defined(TRMMKERNEL) | |||
| .L_N1_M0: | |||
| .L_N0: | |||
| pop_if_used 26, 32 | |||
| pop_if_used 9, 8 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -135,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //.L_N0 | |||
| PROLOGUE | |||
| push_if_used 26, 32 | |||
| push_if_used 9, 8 | |||
| move TD, DST | |||
| move TS, SRC | |||
| @@ -458,6 +458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PTR_ADDI M, M, -1 | |||
| blt ZERO, M, .L_N1_M1 | |||
| .L_N0: | |||
| pop_if_used 26, 32 | |||
| pop_if_used 9, 8 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //.L_N0 | |||
| PROLOGUE | |||
| push_if_used 17, 20 | |||
| push_if_used 0, 0 | |||
| move TD, DST | |||
| move TS, SRC | |||
| @@ -293,6 +293,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PTR_ADDI M, M, -1 | |||
| blt ZERO, M, .L_N1_M1 | |||
| .L_N0: | |||
| pop_if_used 17, 20 | |||
| pop_if_used 0, 0 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -118,7 +118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //.L_M0 | |||
| PROLOGUE | |||
| push_if_used 24, 8 | |||
| push_if_used 7, 0 | |||
| move S0, SRC | |||
| move P0, DST | |||
| @@ -521,6 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI P5, P5, 0x04 | |||
| .L_M0: | |||
| pop_if_used 24, 8 | |||
| pop_if_used 7, 0 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //.L_M0 | |||
| PROLOGUE | |||
| push_if_used 23, 8 | |||
| push_if_used 6, 0 | |||
| move S0, SRC | |||
| move P0, DST | |||
| @@ -401,6 +401,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI P4, P4, 0x04 | |||
| .L_M0: | |||
| pop_if_used 23, 8 | |||
| pop_if_used 6, 0 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -418,7 +418,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 7, 19 | |||
| push_if_used 7, 0 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| PTR_SUB J, INC_Y, K | |||
| @@ -458,6 +458,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | |||
| SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 | |||
| .L_END: | |||
| pop_if_used 17 + 7, 19 | |||
| pop_if_used 7, 0 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -369,7 +369,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 8, 18 | |||
| push_if_used 8, 0 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
| @@ -400,6 +400,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_GAP_1: /* if (incx != 1) */ | |||
| SGEMV_T_LASX GAP_1, X8_GAP, X4_GAP | |||
| .L_END: | |||
| pop_if_used 17 + 8, 18 | |||
| pop_if_used 8, 0 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -253,7 +253,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 7, 31 | |||
| push_if_used 7, 7 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| PTR_SUB J, INC_Y, K | |||
| @@ -291,6 +291,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | |||
| ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1 | |||
| .L_END: | |||
| pop_if_used 17 + 7, 31 | |||
| pop_if_used 7, 7 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -298,7 +298,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 7, 31 | |||
| push_if_used 7, 7 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| PTR_SUB J, INC_Y, K | |||
| @@ -337,7 +337,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | |||
| ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1 | |||
| .L_END: | |||
| pop_if_used 17 + 7, 31 | |||
| pop_if_used 7, 7 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 8, 30 | |||
| push_if_used 8, 6 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
| @@ -263,6 +263,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_GAP_1: /* if (incx != 1) */ | |||
| ZGEMV_T_LSX GAP_1, X2_GAP | |||
| .L_END: | |||
| pop_if_used 17 + 8, 30 | |||
| pop_if_used 8, 6 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 8, 30 | |||
| push_if_used 8, 6 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
| @@ -294,6 +294,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L_GAP_1: /* if (incx != 1) */ | |||
| ZGEMV_T_LASX GAP_1, X4_GAP | |||
| .L_END: | |||
| pop_if_used 17 + 8, 30 | |||
| pop_if_used 8, 6 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||