From 71fcee6eef23bd058d596c42f2d90494a629b401 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 29 Jun 2023 11:11:08 +0800 Subject: [PATCH] LoongArch64: Update dgemm kernel --- kernel/loongarch64/dgemm_kernel_16x4.S | 4058 ++++++++++-------------- 1 file changed, 1664 insertions(+), 2394 deletions(-) diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S index 13faa977e..f8e26fda2 100644 --- a/kernel/loongarch64/dgemm_kernel_16x4.S +++ b/kernel/loongarch64/dgemm_kernel_16x4.S @@ -28,6 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +/********************************************************************* +* 2023/06/28 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2023/06/28 guxiwei +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 4 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 32 +* DGEMM_DEFAULT_Q 152 +* DGEMM_DEFAULT_R 858 +* A_PR1 1024 +* B_PR1 256 +* +* +* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000: +* 1 thread: 36.0 GFLOPS +* 2 threads: 71.6 GFLOPS +* 3 threads: 101.5 GFLOPS +* 4 threads: 132.8 GFLOPS +*********************************************************************/ + /* Function parameters */ #define M $r4 // param 1: bm #define N $r5 // param 2: bn @@ -68,1290 +93,1331 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define U4 $xr4 #define U5 $xr5 #define U6 $xr6 -#define D0 $xr7 -#define D1 $xr8 -#define D2 $xr9 -#define D3 $xr10 -#define D4 $xr11 -#define D5 $xr12 -#define D6 $xr13 -#define D7 $xr14 -#define D8 $xr15 -#define D9 $xr16 -#define D10 $xr17 -#define D11 $xr18 -#define D12 $xr19 -#define D13 $xr20 -#define D14 $xr21 -#define D15 $xr22 -#define VALPHA $xr23 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 +#define VALPHA $xr15 /* Prefetch interval */ -#define A_PRE 0x200 +#define A_PRE 0x400 #define B_PRE 0x100 - PROLOGUE - - addi.d $sp, $sp, -56 - /* Store regs */ - SDARG $r23, $sp, 0 - SDARG $r24, $sp, 8 - SDARG $r25, $sp, 16 - SDARG $r26, $sp, 24 - SDARG $r27, $sp, 32 - ST $f23, $sp, 40 - ST ALPHA, $sp, 48 - - /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */ - xvld VALPHA, $sp, 48 - xvreplve0.d VALPHA, VALPHA - -#if defined (TRMMKERNEL) && !defined(LEFT) - sub.d OFF, ZERO, OFFSET -#else - xor OFF, OFF, OFF -#endif - - /* if (!(N >> 2)) goto L_N3 */ - srai.d J, N, 2 /* J = bn >> 2 */ - andi N, N, 0x03 - beq ZERO, J, .L_N3 - -.L_J1: /* J-- && This loop include Condition 1 */ - -/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! 
************************* -* dgemm_core_16x4 */ - move C0, C - move A0, A - slli.d T0, LDC, 3 - add.d C1, C0, T0 - addi.d J, J, -1 /* J-- */ - add.d C2, C1, T0 - add.d C3, C2, T0 - -#if defined(TRMMKERNEL) && defined(LEFT) - move OFF, OFFSET -#endif - - /* if (!(M >> 4)) goto L_M8 */ - srai.d I, M, 4 /* I = bm >> 4 */ - beq ZERO, I, .L_M8 - -.L_I1: /* I-- */ -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x07 - add.d A0, A0, T0 - slli.d T0, OFF, 0x05 - add.d B0, B, T0 -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 16 -#else - /* number of values in B */ - addi.d L, OFF, 4 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - /* Calculate the first set of D0~D15, - * avoidig set 0 operation - * Load 16 * 64 from A0 - * U0 = {a3, a2, a1, a0} - * U1 = {a7, a6, a5, a4} - * U2 = {a11, a10, a9, a8} - * U3 = {a15, a14, a13, a12} - */ +.macro KERNEL2x16x4 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - preld 0, C0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 - preld 0, C0, 0x40 - xvfmul.d D2, U2, U4 - xvfmul.d D3, U3, U4 - - xvldrepl.d U4, B0, 0x08 - preld 0, C1, 0x00 - /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 - preld 0, C1, 0x40 - xvfmul.d D6, U2, U4 - xvfmul.d D7, U3, U4 - - xvldrepl.d U4, B0, 0x10 - preld 0, C2, 0x00 - /* line 3 */ - xvfmul.d D8, U0, U4 - xvfmul.d D9, U1, U4 - preld 0, C2, 0x40 - xvfmul.d D10, U2, U4 - xvfmul.d D11, U3, U4 - - xvldrepl.d U4, B0, 0x18 - preld 0, C3, 0x00 - /* line 4 */ - xvfmul.d D12, U0, U4 - xvfmul.d D13, U1, U4 - preld 0, C3, 0x40 - xvfmul.d D14, U2, U4 - xvfmul.d D15, U3, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_L7 */ - beq ZERO,TL, .L_L7 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - /* Calculate 8 sets of D0~D15 */ -.L_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 - /***8-2***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 
0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 xvfmadd.d D2, U2, U4, D2 xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + preld 0, B0, B_PRE + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 +.endm - /***8-3***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL2x16x4_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 - /***8-4***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 xvfmadd.d D3, U3, U4, D3 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + preld 0, B0, B_PRE + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 preld 0, A0, A_PRE + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 preld 0, A0, A_PRE + 0x40 
+ xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 +.macro KERNEL8x16x4 +.rept 4 + KERNEL2x16x4 +.endr +.endm - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 +.macro KERNEL8x16x4_END +.rept 3 + KERNEL2x16x4 +.endr + KERNEL2x16x4_END +.endm - /***8-5***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL2x8x4 xvld U0, A0, 0x00 xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 - /***8-6***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 + xvldrepl.d U12, B0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 +.endm - /***8-7***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL2x8x4_END xvld U0, A0, 0x00 xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvfmadd.d D4, U0, U5, D4 + 
xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 +.endm - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 +.macro KERNEL8x8x4 +.rept 4 + KERNEL2x8x4 +.endr +.endm - /***8-8***/ - /* Load 16 * 64 from A0 */ +.macro KERNEL8x8x4_END +.rept 3 + KERNEL2x8x4 +.endr + KERNEL2x8x4_END +.endm + +.macro KERNEL2x4x4 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE + xvfmadd.d D0, U8, U12, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_TL1 + xvld U8, A0, 0x00 - /* Maybe we need calculate the last - * 7 sets of D0~D15? 
- */ -.L_L7: - /* if (!(L & 7)) goto L_L0 */ - andi TL, L, 7 - beq TL, ZERO,.L_L0 + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 -.L_L71: - /* Load 16 * 64 from A0 */ + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x4_END xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvfmadd.d D0, U8, U12, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 - /* Add stride for A0, B0 */ - addi.d A0, A0, 0x80 + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 - addi.d TL, TL, -1 - blt ZERO,TL, .L_L71 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D12, U0, U7, D12 +.endm -.L_L0: -#if defined(TRMMKERNEL) - xvfmul.d D0, D0, VALPHA - xvfmul.d D1, D1, VALPHA - xvfmul.d D2, D2, VALPHA - xvfmul.d D3, D3, VALPHA - xvfmul.d D4, D4, VALPHA - xvfmul.d D5, D5, VALPHA - xvfmul.d D6, D6, VALPHA - xvfmul.d D7, D7, VALPHA - xvfmul.d D8, D8, VALPHA - xvfmul.d D9, D9, VALPHA - xvfmul.d D10, D10, VALPHA - xvfmul.d D11, D11, VALPHA - xvfmul.d D12, D12, VALPHA - xvfmul.d D13, D13, VALPHA - xvfmul.d D14, D14, VALPHA - xvfmul.d D15, D15, VALPHA -#else - /* Load C0 */ - xvld U0, C0, 0x00 - xvld U1, C0, 0x20 - xvld U2, C0, 0x40 - xvld U3, C0, 0x60 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - xvfmadd.d D1, D1, VALPHA, U1 - xvfmadd.d D2, D2, VALPHA, U2 - xvfmadd.d D3, D3, VALPHA, U3 +.macro KERNEL8x4x4 +.rept 4 + KERNEL2x4x4 +.endr +.endm - /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvld U2, C1, 0x40 - xvld U3, C1, 0x60 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 - xvfmadd.d D6, D6, VALPHA, U2 - xvfmadd.d D7, D7, VALPHA, U3 +.macro KERNEL8x4x4_END +.rept 3 + KERNEL2x4x4 +.endr + KERNEL2x4x4_END +.endm - /* Load C2 */ - xvld U0, C2, 0x00 - xvld U1, C2, 0x20 - xvld U2, C2, 0x40 - xvld U3, C2, 0x60 - xvfmadd.d D8, D8, VALPHA, U0 - xvfmadd.d D9, D9, VALPHA, U1 - xvfmadd.d D10, D10, VALPHA, U2 - xvfmadd.d D11, D11, VALPHA, U3 +.macro KERNEL2x2x4 + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 - /* Load C3 */ - xvld U0, C3, 0x00 - xvld U1, C3, 0x20 - xvld U2, C3, 0x40 - xvld U3, C3, 0x60 - xvfmadd.d D12, D12, VALPHA, U0 - xvfmadd.d D13, D13, VALPHA, U1 - xvfmadd.d D14, D14, VALPHA, U2 - xvfmadd.d D15, D15, VALPHA, U3 -#endif // #if defined(TRMMKERNEL) + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - /* Store C0 */ - xvst D0, C0, 0x00 - xvst D1, C0, 0x20 - xvst D2, C0, 0x40 - xvst D3, C0, 0x60 - /* Store C1 */ - xvst D4, C1, 0x00 - xvst D5, C1, 0x20 - xvst D6, C1, 0x40 - xvst D7, C1, 0x60 - /* Store C2 */ - xvst D8, C2, 0x00 - xvst D9, C2, 0x20 - xvst D10, C2, 0x40 - xvst D11, C2, 0x60 - /* Store C3 */ - xvst D12, C3, 0x00 - xvst D13, C3, 
0x20 - xvst D14, C3, 0x40 - xvst D15, C3, 0x60 + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 - /* Add stride for C */ - addi.d C0, C0, 0x80 - addi.d C1, C1, 0x80 - addi.d C2, C2, 0x80 - addi.d C3, C3, 0x80 + xvldrepl.d U8, A0, 0x00 + xvldrepl.d U9, A0, 0x08 -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - sub.d L, K, OFF -#ifdef LEFT - /* number of values in A */ - addi.d L, L, -16 -#else - /* number of values in B */ - addi.d L, L, -4 -#endif - slli.d T0, L, 0x07 - add.d A0, A0, T0 - slli.d T0, L, 0x05 - add.d B0, B0, T0 -#endif + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 -#ifdef LEFT - addi.d OFF, OFF, 0x10 -#endif -#endif // #if defined(TRMMKERNEL) + xvld U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm - addi.d I, I, -1 /* I-- */ - blt ZERO,I, .L_I1 +.macro KERNEL2x2x4_END + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 -.L_M8: - /* We have done M & 16, considering M=8/4/2/1 */ - andi I, M, 15 - beq ZERO,I, .L_M0 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 - andi I, M, 8 - beq ZERO,I, .L_M4 + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x06 - add.d A0, A0, T0 - slli.d T0, OFF, 0x05 - add.d B0, B, T0 -#endif + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 8 -#else - /* number of values in B */ - addi.d L, OFF, 4 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif // #if defined(TRMMKERNEL) +.macro KERNEL8x2x4 +.rept 4 + KERNEL2x2x4 +.endr +.endm - /* Load 8 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 +.macro KERNEL8x2x4_END +.rept 3 + KERNEL2x2x4 +.endr + KERNEL2x2x4_END +.endm - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 +.macro KERNEL2x1x4 + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 - xvldrepl.d U4, B0, 0x10 - /* line 3 */ - xvfmul.d D8, U0, U4 - xvfmul.d D9, U1, U4 + xvldrepl.d U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvld U12, B0, 0x00 - xvldrepl.d U4, B0, 0x18 - /* line 4 */ - xvfmul.d D12, U0, U4 - xvfmul.d D13, U1, U4 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 +.endm - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M8_L7 */ - beq ZERO,TL, .L_M8_L7 +.macro KERNEL2x1x4_END + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 -.L_M8_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x4 +.rept 4 + KERNEL2x1x4 +.endr +.endm + +.macro KERNEL8x1x4_END +.rept 3 + KERNEL2x1x4 +.endr + KERNEL2x1x4_END +.endm + +.macro KERNEL2x16x2 xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 xvldrepl.d 
U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +.endm - /***8-2***/ +.macro KERNEL2x16x2_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 +.endm - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 +.macro KERNEL8x16x2 +.rept 4 + KERNEL2x16x2 +.endr +.endm + +.macro KERNEL8x16x2_END +.rept 3 + KERNEL2x16x2 +.endr + KERNEL2x16x2_END +.endm - /***8-3***/ +.macro KERNEL2x8x2 xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U9, A0, 0x20 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + addi.d B0, B0, 0x10 +.endm - /***8-4***/ +.macro KERNEL2x8x2_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 +.endm - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 +.macro KERNEL8x8x2 +.rept 4 + KERNEL2x8x2 +.endr +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL8x8x2_END +.rept 3 + KERNEL2x8x2 + .endr + KERNEL2x8x2_END +.endm - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - - /***8-5***/ +.macro KERNEL2x4x2 xvld U0, A0, 0x00 - xvld 
U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvldrepl.d U5, B0, 0x08 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 +.endm - /***8-6***/ +.macro KERNEL2x4x2_END xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvldrepl.d U5, B0, 0x08 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL8x4x2 +.rept 4 + KERNEL2x4x2 +.endr +.endm - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 +.macro KERNEL8x4x2_END +.rept 3 + KERNEL2x4x2 +.endr + KERNEL2x4x2_END +.endm - /***8-7***/ +.macro KERNEL2x2x2 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D4, U0, U5, D4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL2x2x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm - /***8-8***/ +.macro KERNEL8x2x2 +.rept 4 + KERNEL2x2x2 +.endr +.endm + +.macro KERNEL8x2x2_END +.rept 3 + KERNEL2x2x2 +.endr + KERNEL2x2x2_END +.endm + +.macro KERNEL2x1x2 xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D4, U0, U5, D4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 +.macro KERNEL2x1x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_M8_TL1 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 -.L_M8_L7: - /* 
if (!(L & 7)) goto L_M8_L0 */ - andi TL, L, 7 - beq TL, ZERO,.L_M8_L0 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm -.L_M8_L71: +.macro KERNEL8x1x2 +.rept 4 + KERNEL2x1x2 +.endr +.endm + +.macro KERNEL8x1x2_END +.rept 3 + KERNEL2x1x2 +.endr + KERNEL2x1x2_END +.endm + +.macro KERNEL2x16x1 xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvldrepl.d U12, B0, 0x00 - /* Add stride for A0, B0 */ - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 +.endm - addi.d TL, TL, -1 - blt ZERO,TL, .L_M8_L71 +.macro KERNEL2x16x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 -.L_M8_L0: -#if defined(TRMMKERNEL) - xvfmul.d D0, D0, VALPHA - xvfmul.d D1, D1, VALPHA - xvfmul.d D4, D4, VALPHA - xvfmul.d D5, D5, VALPHA - xvfmul.d D8, D8, VALPHA - xvfmul.d D9, D9, VALPHA - xvfmul.d D12, D12, VALPHA - xvfmul.d D13, D13, VALPHA -#else - /* Load C0 */ - xvld U0, C0, 0x00 - xvld U1, C0, 0x20 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - xvfmadd.d D1, D1, VALPHA, U1 + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 - /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 - /* Load C2 */ - xvld U0, C2, 0x00 - xvld U1, C2, 0x20 - xvfmadd.d D8, D8, VALPHA, U0 - xvfmadd.d D9, D9, VALPHA, U1 + xvldrepl.d U4, B0, 0x00 - /* Load C3 */ - xvld U0, C3, 0x00 - xvld U1, C3, 0x20 - xvfmadd.d D12, D12, VALPHA, U0 - xvfmadd.d D13, D13, VALPHA, U1 -#endif // #if defined(TRMMKERNEL) + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 - /* Store C0 */ - xvst D0, C0, 0x00 - xvst D1, C0, 0x20 - /* Store C1 */ - xvst D4, C1, 0x00 - xvst D5, C1, 0x20 - /* Store C2 */ - xvst D8, C2, 0x00 - xvst D9, C2, 0x20 - /* Store C3 */ - xvst D12, C3, 0x00 - xvst D13, C3, 0x20 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 - /* Add stride for C */ - addi.d C0, C0, 0x40 - addi.d C1, C1, 0x40 - addi.d C2, C2, 0x40 - addi.d C3, C3, 0x40 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 +.endm -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - sub.d L, K, OFF -#ifdef LEFT - /* number of values in A */ - addi.d L, L, -8 -#else - /* number of values in B */ - addi.d L, L, -4 -#endif - slli.d T0, L, 0x06 - add.d A0, A0, T0 - slli.d T0, L, 0x05 - add.d B0, B0, T0 -#endif +.macro KERNEL8x16x1 +.rept 4 + KERNEL2x16x1 +.endr +.endm -#ifdef LEFT - /* number of values in A */ - addi.d OFF, OFF, 0x08 -#endif -#endif // #if defined(TRMMKERNEL) +.macro KERNEL8x16x1_END +.rept 3 + KERNEL2x16x1 +.endr + KERNEL2x16x1_END +.endm -/********LOOP (if(N >> 2 ) && (M & 8)) End************/ +.macro KERNEL2x8x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 -.L_M4: - andi I, M, 4 - beq 
ZERO,I, .L_M2 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x05 - add.d A0, A0, T0 - add.d B0, B, T0 -#endif + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvld U9, A0, 0x20 + xvldrepl.d U12, B0, 0x00 -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 4 -#else - /* number of values in B */ - addi.d L, OFF, 4 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 +.endm - /* Load 4 * 64 from A0 */ +.macro KERNEL2x8x1_END xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - /* line 3 */ - xvfmul.d D8, U0, U4 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm - xvldrepl.d U4, B0, 0x18 - /* line 4 */ - xvfmul.d D12, U0, U4 +.macro KERNEL8x8x1 +.rept 4 + KERNEL2x8x1 +.endr +.endm - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M4_L7 */ - beq ZERO,TL, .L_M4_L7 +.macro KERNEL8x8x1_END +.rept 3 + KERNEL2x8x1 +.endr + KERNEL2x8x1_END +.endm -.L_M4_TL1: /* TL-- */ - /***8-1***/ +.macro KERNEL2x4x1 xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + addi.d B0, B0, 0x08 +.endm - /***8-2***/ +.macro KERNEL2x4x1_END xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D0, U0, U4, D0 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.macro KERNEL8x4x1 +.rept 4 + KERNEL2x4x1 +.endr +.endm - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 +.macro KERNEL8x4x1_END +.rept 3 + KERNEL2x4x1 +.endr + KERNEL2x4x1_END +.endm - /***8-3***/ +.macro KERNEL2x2x1 xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 +.endm - /***8-4***/ +.macro KERNEL2x2x1_END xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D0, U0, U4, D0 +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, 
U4, D12 +.macro KERNEL8x2x1 +.rept 4 + KERNEL2x2x1 +.endr +.endm - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 +.macro KERNEL8x2x1_END +.rept 3 + KERNEL2x2x1 +.endr + KERNEL2x2x1_END +.endm - /***8-5***/ +.macro KERNEL2x1x1 xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 +.endm - /***8-6***/ +.macro KERNEL2x1x1_END xvld U0, A0, 0x00 - + xvfmadd.d D0, U8, U12, D0 xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + xvfmadd.d D0, U0, U4, D0 +.endm - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +.macro KERNEL8x1x1 +.rept 4 + KERNEL2x1x1 +.endr +.endm - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +.macro KERNEL8x1x1_END +.rept 3 + KERNEL2x1x1 +.endr + KERNEL2x1x1_END +.endm - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 + PROLOGUE - /***8-7***/ - xvld U0, A0, 0x00 + addi.d $sp, $sp, -120 + /* Store regs */ + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + ST $f23, $sp, 40 + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA, $sp, 112 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +#if defined (TRMMKERNEL) && !defined(LEFT) + sub.d OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* if (!(N >> 2)) goto L_N3 */ + srai.d J, N, 2 /* J = bn >> 2 */ + andi N, N, 0x03 + xvldrepl.d VALPHA, $sp, 112 /* When N < 4, VALPHA will not changed */ + beq ZERO, J, .L_N3 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +.L_J1: /* J-- && This loop include Condition 1 */ - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! 
************************* +* dgemm_core_16x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 - /***8-8***/ +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoidig set 0 operation + * Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + preld 0, C0, 0x40 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + preld 0, C1, 0x40 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U6, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 + preld 0, C2, 0x40 + xvfmul.d D10, U2, U6 + xvfmul.d D11, U3, U6 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U7, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 + preld 0, C3, 0x40 + xvfmul.d D14, U2, U7 + xvfmul.d D15, U3, U7 - addi.d A0, A0, 0x20 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 + beq ZERO, TL, .L_TL1_END +.L_TL1: /* TL-- */ + KERNEL8x16x4 addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_M4_TL1 + blt ZERO,TL, .L_TL1 -.L_M4_L7: - /* if (!(L & 7)) goto L_M4_L0 */ +.L_TL1_END: + KERNEL8x16x4_END + + /* Maybe we need calculate the last + * 7 sets of D0~D15? 
+ */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ andi TL, L, 7 - beq TL, ZERO,.L_M4_L0 + beq TL, ZERO,.L_L0 -.L_M4_L71: +.L_L71: + /* Load 16 * 64 from A0 */ xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + /* Cumulative D0~D15 */ xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 /* Add stride for A0, B0 */ - addi.d A0, A0, 0x20 + addi.d A0, A0, 0x80 addi.d B0, B0, 0x20 addi.d TL, TL, -1 - blt ZERO,TL, .L_M4_L71 + blt ZERO,TL, .L_L71 -.L_M4_L0: +.L_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D10, D10, VALPHA + xvfmul.d D11, D11, VALPHA xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA + xvfmul.d D14, D14, VALPHA + xvfmul.d D15, D15, VALPHA #else /* Load C0 */ xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U4 + xvfmadd.d D5, D5, VALPHA, U5 + xvfmadd.d D6, D6, VALPHA, U6 + xvfmadd.d D7, D7, VALPHA, U7 /* Load C2 */ - xvld U0, C2, 0x00 - xvfmadd.d D8, D8, VALPHA, U0 + xvld U8, C2, 0x00 + xvld U9, C2, 0x20 + xvld U10, C2, 0x40 + xvld U11, C2, 0x60 + xvfmadd.d D8, D8, VALPHA, U8 + xvfmadd.d D9, D9, VALPHA, U9 + xvfmadd.d D10, D10, VALPHA, U10 + xvfmadd.d D11, D11, VALPHA, U11 /* Load C3 */ xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvld U2, C3, 0x40 + xvld U3, C3, 0x60 xvfmadd.d D12, D12, VALPHA, U0 -#endif // #if defined(TRMMKERNEL) + xvfmadd.d D13, D13, VALPHA, U1 + xvfmadd.d D14, D14, VALPHA, U2 + xvfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) /* Store C0 */ xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 /* Store C1 */ xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 /* Store C2 */ xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + xvst D10, C2, 0x40 + xvst D11, C2, 0x60 /* Store C3 */ xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + xvst D14, C3, 0x40 + xvst D15, C3, 0x60 /* Add stride for C */ - addi.d C0, C0, 0x20 - addi.d C1, C1, 0x20 - addi.d C2, C2, 0x20 - addi.d C3, C3, 0x20 + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + addi.d C2, C2, 0x80 + addi.d C3, C3, 0x80 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub.d L, K, OFF #ifdef LEFT - /* number of values in A */ - addi.d L, L, -4 + /* number of values in A */ + addi.d L, L, -16 #else /* number of values in B */ addi.d L, L, 
-4 #endif - slli.d T0, L, 0x05 + slli.d T0, L, 0x07 add.d A0, A0, T0 + slli.d T0, L, 0x05 add.d B0, B0, T0 #endif #ifdef LEFT - /* number of values in A */ - addi.d OFF, OFF, 0x04 + addi.d OFF, OFF, 0x10 #endif #endif // #if defined(TRMMKERNEL) -/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 -.L_M2: - andi I, M, 2 - beq ZERO,I, .L_M1 +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B0, B #else - slli.d T0, OFF, 0x04 + slli.d T0, OFF, 0x06 add.d A0, A0, T0 slli.d T0, OFF, 0x05 add.d B0, B, T0 @@ -1361,7 +1427,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub.d L, K, OFF #elif defined(LEFT) /* number of values in A */ - addi.d L, OFF, 2 + addi.d L, OFF, 8 #else /* number of values in B */ addi.d L, OFF, 4 @@ -1369,262 +1435,163 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else // #if !defined(TRMMKERNEL) move B0, B move L, K /* L = bk */ -#endif +#endif // #if defined(TRMMKERNEL) - /* Load 2 * 64 from A0 */ + /* Load 8 * 64 from A0 */ xvld U0, A0, 0x00 + xvld U1, A0, 0x20 xvldrepl.d U4, B0, 0x00 /* line 1 */ xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 - xvldrepl.d U4, B0, 0x10 + xvldrepl.d U6, B0, 0x10 /* line 3 */ - xvfmul.d D8, U0, U4 + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 - xvldrepl.d U4, B0, 0x18 + xvldrepl.d U7, B0, 0x18 /* line 4 */ - xvfmul.d D12, U0, U4 + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x10 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 /* Reduce L */ addi.d L, L, -1 srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M2_L7 */ - beq ZERO,TL, .L_M2_L7 - -.L_M2_TL1: /* TL-- */ - /***8-1***/ - /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d 
B0, B0, 0x20 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + addi.d TL, TL, -1 - addi.d A0, A0, 0x10 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + beq ZERO, TL, .L_M8_TL1_END - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 +.L_M8_TL1: /* TL-- */ + KERNEL8x8x4 addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_M2_TL1 + blt ZERO,TL, .L_M8_TL1 -.L_M2_L7: - /* if (!(L & 7)) goto L_M2_L0 */ +.L_M8_TL1_END: + KERNEL8x8x4_END + +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ andi TL, L, 7 - beq TL, ZERO,.L_M2_L0 + beq TL, ZERO,.L_M8_L0 -.L_M2_L71: +.L_M8_L71: xvld U0, A0, 0x00 + xvld U1, A0, 0x20 xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 /* Add stride for A0, B0 */ - addi.d A0, A0, 0x10 + addi.d A0, A0, 0x40 addi.d B0, B0, 0x20 addi.d TL, TL, -1 - blt ZERO,TL, .L_M2_L71 + blt ZERO,TL, .L_M8_L71 -.L_M2_L0: +.L_M8_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA #else /* Load C0 */ xvld U0, C0, 0x00 + xvld U1, C0, 0x20 xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U2 + xvfmadd.d D5, D5, VALPHA, U3 /* Load C2 */ - xvld U0, C2, 0x00 - xvfmadd.d D8, D8, VALPHA, U0 + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + xvfmadd.d D8, D8, VALPHA, U4 + xvfmadd.d D9, D9, VALPHA, U5 /* Load C3 */ - xvld U0, C3, 0x00 - xvfmadd.d D12, D12, VALPHA, U0 + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 + xvfmadd.d D12, D12, VALPHA, U6 + xvfmadd.d D13, D13, VALPHA, U7 #endif // #if defined(TRMMKERNEL) - xvstelm.d D0, C0, 0x00, 0x00 - xvstelm.d D4, C1, 0x00, 0x00 - xvstelm.d D8, C2, 0x00, 0x00 - xvstelm.d D12, C3, 0x00, 0x00 - xvstelm.d D0, C0, 0x08, 0x01 - xvstelm.d D4, C1, 0x08, 0x01 - xvstelm.d D8, C2, 0x08, 0x01 - xvstelm.d D12, C3, 0x08, 0x01 + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + /* Store C2 */ + 
xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 /* Add stride for C */ - addi.d C0, C0, 0x10 - addi.d C1, C1, 0x10 - addi.d C2, C2, 0x10 - addi.d C3, C3, 0x10 + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) sub.d L, K, OFF #ifdef LEFT /* number of values in A */ - addi.d L, L, -2 + addi.d L, L, -8 #else /* number of values in B */ addi.d L, L, -4 #endif - slli.d T0, L, 0x04 + slli.d T0, L, 0x06 add.d A0, A0, T0 slli.d T0, L, 0x05 add.d B0, B0, T0 @@ -1632,23 +1599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef LEFT /* number of values in A */ - addi.d OFF, OFF, 0x02 + addi.d OFF, OFF, 0x08 #endif #endif // #if defined(TRMMKERNEL) -/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ +/********LOOP (if(N >> 2 ) && (M & 8)) End************/ -.L_M1: - andi I, M, 1 - beq ZERO,I, .L_M0 +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 #if defined(TRMMKERNEL) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) move B0, B #else - slli.d T0, OFF, 0x03 - add.d A0, A0, T0 slli.d T0, OFF, 0x05 + add.d A0, A0, T0 add.d B0, B, T0 #endif @@ -1656,7 +1622,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub.d L, K, OFF #elif defined(LEFT) /* number of values in A */ - addi.d L, OFF, 1 + addi.d L, OFF, 4 #else /* number of values in B */ addi.d L, OFF, 4 @@ -1666,55 +1632,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. move L, K /* L = bk */ #endif - /* Load 1 * 64 from A0 */ + /* Load 4 * 64 from A0 */ xvld U0, A0, 0x00 xvldrepl.d U4, B0, 0x00 /* line 1 */ xvfmul.d D0, U0, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 + xvfmul.d D4, U0, U5 - xvldrepl.d U4, B0, 0x10 + xvldrepl.d U6, B0, 0x10 /* line 3 */ - xvfmul.d D8, U0, U4 + xvfmul.d D8, U0, U6 - xvldrepl.d U4, B0, 0x18 + xvldrepl.d U7, B0, 0x18 /* line 4 */ - xvfmul.d D12, U0, U4 + xvfmul.d D12, U0, U7 /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x08 + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 /* Reduce L */ addi.d L, L, -1 srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_M1_L7 */ - beq ZERO,TL, .L_M1_L7 + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 -.L_M1_TL1: /* TL-- */ - /***8-1***/ - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + beq ZERO, TL, .L_M4_TL1_END - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.L_M4_TL1: /* TL-- */ + KERNEL8x4x4 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 + +.L_M4_TL1_END: + KERNEL8x4x4_END + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 - /***8-2***/ +.L_M4_L71: xvld U0, A0, 0x00 xvldrepl.d U4, B0, 0x00 @@ -1729,119 +1702,287 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvldrepl.d U4, B0, 0x18 xvfmadd.d D12, U0, U4, D12 - addi.d A0, A0, 0x08 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 addi.d B0, B0, 0x20 - /***8-3***/ - xvld U0, A0, 0x00 + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +.L_M4_L0: + xvldrepl.d VALPHA, $sp, 112 +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* Load C1 */ + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + /* Load C2 */ + xvld U2, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U2 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + /* Load C3 */ + xvld U3, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + /* Store C2 */ + xvst D8, C2, 0x00 + /* Store C3 */ + xvst D12, C3, 0x00 - /***8-4***/ - xvld U0, A0, 0x00 + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 - addi.d A0, A0, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + + xvld U4, B0, 0x00 + + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + + xvldrepl.d U8, A0, 0x00 + xvldrepl.d U9, A0, 0x08 + + addi.d TL, TL, -1 + + xvld U12, B0, 0x00 + addi.d A0, A0, 0x10 addi.d B0, B0, 0x20 - /***8-5***/ - xvld U0, A0, 0x00 + beq ZERO, TL, .L_M2_TL1_END +.L_M2_TL1: /* TL-- */ + KERNEL8x2x4 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_TL1_END: + KERNEL8x2x4_END - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +.L_M2_L71: + xvldrepl.d U0, A0, 
0x00 + xvldrepl.d U1, A0, 0x08 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U4, B0, 0x00 - addi.d A0, A0, 0x08 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 addi.d B0, B0, 0x20 - /***8-6***/ - xvld U0, A0, 0x00 + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +.L_M2_L0: + xvldrepl.d VALPHA, $sp, 112 +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D0, C1, 0x00, 0x01 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D0, C3, 0x00, 0x03 + xvstelm.d D1, C0, 0x08, 0x00 + xvstelm.d D1, C1, 0x08, 0x01 + xvstelm.d D1, C2, 0x08, 0x02 + xvstelm.d D1, C3, 0x08, 0x03 +#else + xvpackev.d D4, D1, D0 + xvpackod.d D5, D1, D0 + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvpermi.q U2, U0, 0x20 + xvpermi.q U3, U1, 0x20 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D0, D4, VALPHA, U2 + xvfmadd.d D1, D5, VALPHA, U3 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + vst $vr16, C0, 0x00 + vst $vr17, C1, 0x00 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D1, C3, 0x00, 0x02 + xvstelm.d D0, C2, 0x08, 0x03 + xvstelm.d D1, C3, 0x08, 0x03 +#endif // #if defined(TRMMKERNEL) - /***8-7***/ - xvld U0, A0, 0x00 + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif - /***8-8***/ - xvld U0, A0, 0x00 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmul.d D0, U0, U4 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 + xvldrepl.d U8, A0, 0x00 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + addi.d TL, TL, -1 + xvld U12, B0, 0x00 + addi.d 
A0, A0, 0x08 + addi.d B0, B0, 0x20 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + beq ZERO, TL, .L_M1_TL1_END + +.L_M1_TL1: /* TL-- */ + KERNEL8x1x4 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_M1_TL1 +.L_M1_TL1_END: + KERNEL8x1x4_END .L_M1_L7: /* if (!(L & 7)) goto L_M1_L0 */ @@ -1849,19 +1990,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. beq TL, ZERO,.L_M1_L0 .L_M1_L71: - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 /* Add stride for A0, B0 */ addi.d A0, A0, 0x08 @@ -1871,33 +2002,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. blt ZERO,TL, .L_M1_L71 .L_M1_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA - xvfmul.d D4, D4, VALPHA - xvfmul.d D8, D8, VALPHA - xvfmul.d D12, D12, VALPHA + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D0, C1, 0x00, 0x01 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D0, C3, 0x00, 0x03 #else /* Load C0 */ - xvld U0, C0, 0x00 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvldrepl.d U0, C0, 0x00 + xvfmadd.d D4, D0, VALPHA, U0 /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvldrepl.d U1, C1, 0x00 + xvfmadd.d D5, D0, VALPHA, U1 /* Load C2 */ - xvld U0, C2, 0x00 - xvfmadd.d D8, D8, VALPHA, U0 + xvldrepl.d U2, C2, 0x00 + xvfmadd.d D6, D0, VALPHA, U2 /* Load C3 */ - xvld U0, C3, 0x00 - xvfmadd.d D12, D12, VALPHA, U0 -#endif // #if defined(TRMMKERNEL) + xvldrepl.d U3, C3, 0x00 + xvfmadd.d D7, D0, VALPHA, U3 - xvstelm.d D0, C0, 0x00, 0x00 - xvstelm.d D4, C1, 0x00, 0x00 - xvstelm.d D8, C2, 0x00, 0x00 - xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D4, C0, 0x00, 0x00 + xvstelm.d D5, C1, 0x00, 0x01 + xvstelm.d D6, C2, 0x00, 0x02 + xvstelm.d D7, C3, 0x00, 0x03 +#endif // #if defined(TRMMKERNEL) /* Add stride for C */ addi.d C0, C0, 0x08 @@ -1952,6 +2086,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ///////////////////////////////////////////////// /************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/ + xvldrepl.d VALPHA, $sp, 112 + .L_N3: andi J, N, 2 beq ZERO, J, .L_N1 @@ -1993,223 +2129,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi.d L, OFF, 2 #endif #else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 16 * 64 from A0 - * U0 = {a3, a2, a1, a0} - * U1 = {a7, a6, a5, a4} - * U2 = {a11, a10, a9, a8} - * U3 = {a15, a14, a13, a12} - */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 - xvfmul.d D2, U2, U4 - xvfmul.d D3, U3, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 - xvfmul.d D6, U2, U4 - xvfmul.d D7, U3, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N3_L7 */ - beq ZERO,TL, .L_N3_L7 - -.L_N3_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-2***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-3***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-4***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-5***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-6***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 + move B0, B + move L, K /* L = bk */ +#endif - /***8-7***/ - /* Load 16 
* 64 from A0 */ + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ xvld U0, A0, 0x00 xvld U1, A0, 0x20 xvld U2, A0, 0x40 xvld U3, A0, 0x60 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 - /***8-8***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x80 addi.d B0, B0, 0x10 + beq ZERO, TL, .L_N3_TL1_END + +.L_N3_TL1: /* TL-- */ + KERNEL8x16x2 + addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_TL1 +.L_N3_TL1_END: + KERNEL8x16x2_END .L_N3_L7: /* if (!(L & 7)) goto L_N3_L0 */ @@ -2229,12 +2207,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D2, U2, U4, D2 xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 /* Add stride for A0, B0 */ addi.d A0, A0, 0x80 addi.d B0, B0, 0x10 @@ -2264,14 +2241,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D3, D3, VALPHA, U3 /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvld U2, C1, 0x40 - xvld U3, C1, 0x60 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 - xvfmadd.d D6, D6, VALPHA, U2 - xvfmadd.d D7, D7, VALPHA, U3 + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U4 + xvfmadd.d D5, D5, VALPHA, U5 + xvfmadd.d D6, D6, VALPHA, U6 + xvfmadd.d D7, D7, VALPHA, U7 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -2352,10 +2329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmul.d D0, U0, U4 xvfmul.d D1, U1, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 /* Add stride for A0 and B0 */ addi.d A0, A0, 0x40 @@ -2366,131 +2343,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* if (TL < 1) goto L_N3_M8_L7 */ beq ZERO,TL, .L_N3_M8_L7 -.L_N3_M8_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-7***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x40 addi.d B0, B0, 0x10 - /***8-8***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + beq ZERO, TL, .L_N3_M8_TL1_END - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 +.L_N3_M8_TL1: /* TL-- */ + KERNEL8x8x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M8_TL1 +.L_N3_M8_TL1_END: + KERNEL8x8x2_END .L_N3_M8_L7: /* if (!(L & 7)) goto L_N3_M8_L0 */ @@ -2505,9 +2376,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 /* Add stride for A0, B0 */ addi.d A0, A0, 0x40 @@ -2530,10 +2401,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D1, D1, VALPHA, U1 /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U2 + xvfmadd.d D5, D5, VALPHA, U3 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -2561,162 +2432,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add.d B0, B0, T0 #endif -#ifdef LEFT - addi.d OFF, OFF, 0x08 -#endif -#endif // #if defined(TRMMKERNEL) - -/********LOOP (if(N & 2) && (M & 8) ) End************/ - -.L_N3_M4: - andi I, M, 4 - beq ZERO,I, .L_N3_M2 - -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x05 - add.d A0, A0, T0 - slli.d T0, OFF, 0x04 - add.d B0, B, T0 -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 4 -#else - /* number of values in B */ - addi.d L, OFF, 2 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 4 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N3_M4_L7 */ - beq ZERO,TL, .L_N3_M4_L7 - -.L_N3_M4_TL1: /* TL-- */ - /***8-1***/ - /* Load 8 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) - /***8-5***/ - xvld U0, A0, 0x00 +/********LOOP (if(N & 2) && (M & 8) ) End************/ - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 +.L_N3_M4: + andi I, M, 4 + beq ZERO,I, .L_N3_M2 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - /***8-6***/ + /* Load 4 * 64 from A0 */ xvld U0, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 + xvldrepl.d U5, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U5 - /***8-7***/ - xvld U0, A0, 0x00 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvld U8, A0, 0x00 - 
xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x20 addi.d B0, B0, 0x10 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + beq ZERO, TL, .L_N3_M4_TL1_END - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 +.L_N3_M4_TL1: /* TL-- */ + KERNEL8x4x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M4_TL1 +.L_N3_M4_TL1_END: + KERNEL8x4x2_END .L_N3_M4_L7: /* if (!(L & 7)) goto L_N3_M4_L0 */ @@ -2729,8 +2517,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 /* Add stride for A0, B0 */ addi.d A0, A0, 0x20 @@ -2749,8 +2537,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -2830,106 +2618,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N3_M2_L7 */ beq ZERO,TL, .L_N3_M2_L7 -.L_N3_M2_TL1: /* TL-- */ - /***8-1***/ - /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 + xvld U8, A0, 0x00 - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x10 addi.d B0, B0, 0x10 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + beq ZERO, TL, .L_N3_M2_TL1_END - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 +.L_N3_M2_TL1: /* TL-- */ + KERNEL8x2x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M2_TL1 +.L_N3_M2_TL1_END: + KERNEL8x2x2_END .L_N3_M2_L7: /* if (!(L & 7)) goto L_N3_M2_L0 */ @@ -2942,8 +2648,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 /* Add stride for A0, B0 */ addi.d A0, A0, 0x10 @@ -2962,8 +2668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 #endif // #if defined(TRMMKERNEL) xvstelm.d D0, C0, 0x00, 0x00 @@ -3017,132 +2723,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else /* number of values in B */ addi.d L, OFF, 2 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N3_M1_L7 */ - beq ZERO,TL, .L_N3_M1_L7 - -.L_N3_M1_TL1: /* TL-- */ - /***8-1***/ - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - /***8-7***/ + /* Load 1 * 64 from A0 */ xvld U0, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 - /***8-8***/ - xvld U0, A0, 0x00 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 addi.d A0, A0, 0x08 addi.d B0, B0, 0x10 + beq ZERO, TL, .L_N3_M1_TL1_END + +.L_N3_M1_TL1: /* TL-- */ + KERNEL8x1x2 + addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M1_TL1 +.L_N3_M1_TL1_END: + KERNEL8x1x2_END .L_N3_M1_L7: /* if (!(L & 7)) goto L_N3_M1_L0 */ @@ -3155,8 +2779,8 @@ USE OF THIS SOFTWARE, 
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 /* Add stride for A0, B0 */ addi.d A0, A0, 0x08 @@ -3175,8 +2799,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 #endif // #if defined(TRMMKERNEL) xvstelm.d D0, C0, 0x00, 0x00 @@ -3300,137 +2924,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N1_L7 */ beq ZERO,TL, .L_N1_L7 -.L_N1_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-2***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-3***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-4***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-5***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-6***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-7***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x80 addi.d B0, B0, 0x08 - /***8-8***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 + beq ZERO, TL, .L_N1_TL1_END +.L_N1_TL1: /* TL-- */ + KERNEL8x16x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_TL1 +.L_N1_TL1_END: + KERNEL8x16x1_END .L_N1_L7: /* if (!(L & 7)) goto L_N1_L0 */ @@ -3494,161 +3006,87 
@@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif slli.d T0, L, 0x07 add.d A0, A0, T0 - slli.d T0, L, 0x03 - add.d B0, B0, T0 -#endif - -#ifdef LEFT - addi.d OFF, OFF, 0x10 -#endif -#endif // #if defined(TRMMKERNEL) - - addi.d I, I, -1 /* I-- */ - blt ZERO,I, .L_N1_I1 - -.L_N1_M8: - /* We have done M & 16, considering M=8/4/2/1 */ - andi I, M, 15 - beq ZERO,I, .L_N1_M0 - - andi I, M, 8 - beq ZERO,I, .L_N1_M4 - -#if defined(TRMMKERNEL) -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B -#else - slli.d T0, OFF, 0x06 - add.d A0, A0, T0 - slli.d T0, OFF, 0x03 - add.d B0, B, T0 -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - sub.d L, K, OFF -#elif defined(LEFT) - /* number of values in A */ - addi.d L, OFF, 8 -#else - /* number of values in B */ - addi.d L, OFF, 1 -#endif -#else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ -#endif - - /* Load 8 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ - /* if (TL < 1) goto L_N1_M8_L7 */ - beq ZERO,TL, .L_N1_M8_L7 - -.L_N1_M8_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif - /***8-5***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 - /***8-6***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + andi I, M, 8 + beq ZERO,I, .L_N1_M4 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif - /***8-7***/ + /* Load 8 * 64 from A0 */ xvld U0, A0, 0x00 xvld U1, A0, 0x20 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + 
xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 - /***8-8***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x40 addi.d B0, B0, 0x08 + beq ZERO, TL, .L_N1_M8_TL1_END +.L_N1_M8_TL1: /* TL-- */ + KERNEL8x8x1 + addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M8_TL1 +.L_N1_M8_TL1_END: + KERNEL8x8x1_END + .L_N1_M8_L7: /* if (!(L & 7)) goto L_N1_M8_L0 */ andi TL, L, 7 @@ -3753,81 +3191,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N1_M4_L7 */ beq ZERO,TL, .L_N1_M4_L7 -.L_N1_M4_TL1: /* TL-- */ - /***8-1***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x20 addi.d B0, B0, 0x08 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + beq ZERO, TL, .L_N1_M4_TL1_END - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 +.L_N1_M4_TL1: /* TL-- */ + KERNEL8x4x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_TL1_END: + KERNEL8x4x1_END .L_N1_M4_L7: /* if (!(L & 7)) goto L_N1_M4_L0 */ @@ -3927,82 +3307,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* if (TL < 1) goto L_N1_M2_L7 */ beq ZERO,TL, .L_N1_M2_L7 -.L_N1_M2_TL1: /* TL-- */ - /***8-1***/ - /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x10 addi.d B0, B0, 0x08 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + beq ZERO, TL, .L_N1_M2_TL1_END - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 +.L_N1_M2_TL1: /* TL-- */ + KERNEL8x2x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_TL1_END: + KERNEL8x2x1_END .L_N1_M2_L7: /* if (!(L & 7)) goto L_N1_M2_L0 */ @@ -4101,82 +3422,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N1_M1_L7 */ beq ZERO,TL, .L_N1_M1_L7 -.L_N1_M1_TL1: /* TL-- */ - /***8-1***/ - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 + xvld U8, A0, 0x00 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 + xvldrepl.d U12, B0, 0x00 addi.d A0, A0, 0x08 addi.d B0, B0, 0x08 - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + beq ZERO, TL, .L_N1_M1_TL1_END - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 +.L_N1_M1_TL1: /* TL-- */ + KERNEL8x1x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END .L_N1_M1_L7: /* if (!(L & 7)) goto L_N1_M1_L0 */ @@ -4243,7 +3505,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDARG $r26, $sp, 24 LDARG $r27, $sp, 32 LD $f23, $sp, 40 - addi.d $sp, $sp, 56 + LD $f24, $sp, 48 + LD $f25, $sp, 56 + LD $f26, $sp, 64 + LD $f27, $sp, 72 + LD $f28, $sp, 80 + LD $f29, $sp, 88 + LD $f30, $sp, 96 + LD $f31, $sp, 104 + addi.d $sp, $sp, 120 jirl $r0, $r1, 0x0
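For reference, every path above follows the same micro-kernel shape: accumulate packed A times packed B over K into the D accumulators, then either scale by alpha (the TRMMKERNEL branches) or fold alpha times the accumulator into the existing C tile, as in the xvfmadd.d epilogues. A minimal scalar sketch of the widest 16x4 tile is given below; the function name dgemm_tile_16x4_ref, the packing assumptions and the small main() driver are illustrative only and are not part of this patch or of OpenBLAS.

#include <stdio.h>
#include <stddef.h>

/* Scalar model of one 16x4 micro-tile.  A and B are assumed to be packed so
 * that each k step holds 16 consecutive doubles of A and 4 consecutive
 * doubles of B, matching the 0x80 / 0x20 pointer strides in the kernel.
 * trmm != 0 models the TRMMKERNEL epilogue (C = alpha*acc); otherwise the
 * result is folded into C (C += alpha*acc). */
static void dgemm_tile_16x4_ref(size_t K, double alpha,
                                const double *A, const double *B,
                                double *C, size_t ldc, int trmm)
{
    double acc[4][16] = {{0.0}};

    for (size_t k = 0; k < K; k++) {
        const double *a = A + 16 * k;  /* 16 values of A per k step */
        const double *b = B + 4 * k;   /*  4 values of B per k step */
        for (int j = 0; j < 4; j++)
            for (int i = 0; i < 16; i++)
                acc[j][i] += a[i] * b[j];   /* xvfmadd.d D*, U*, U*, D* */
    }

    for (int j = 0; j < 4; j++)
        for (int i = 0; i < 16; i++)
            C[j * ldc + i] = trmm ? alpha * acc[j][i]
                                  : C[j * ldc + i] + alpha * acc[j][i];
}

int main(void)
{
    enum { K = 3, LDC = 16 };
    double A[16 * K], B[4 * K], C[4 * LDC] = {0.0};
    for (int i = 0; i < 16 * K; i++) A[i] = 1.0;
    for (int i = 0; i <  4 * K; i++) B[i] = 2.0;

    dgemm_tile_16x4_ref(K, 0.5, A, B, C, LDC, 0);
    printf("C[0][0] = %f (expected %f)\n", C[0], 0.5 * K * 2.0);
    return 0;
}

The narrower .L_N3_* (two-column) and .L_N1_* (one-column) paths above, together with the M = 8/4/2/1 remainder blocks, perform the same computation on smaller tiles.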