| @@ -7,6 +7,8 @@ | |||||
| #define ASSEMBLER | #define ASSEMBLER | ||||
| #include "common.h" | #include "common.h" | ||||
| #define M $4 | #define M $4 | ||||
| #define N $5 | #define N $5 | ||||
| #define K $6 | #define K $6 | ||||
| @@ -429,7 +431,7 @@ | |||||
| .L15: # N=4 M=4 K=2 | .L15: # N=4 M=4 K=2 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,2 # k = KCO&2 | |||||
| andi K,KCO,2 # k = KCO&2 | |||||
| #else | #else | ||||
| andi K,TEMP, 2 | andi K,TEMP, 2 | ||||
| #endif | #endif | ||||
| @@ -693,7 +695,7 @@ | |||||
| .L14_M2: | .L14_M2: | ||||
| and M,MCO,2 # Remainder M = 2 | |||||
| andi M,MCO,2 # Remainder M = 2 | |||||
| beqz M,.L14_M1 | beqz M,.L14_M1 | ||||
| nop | nop | ||||
| @@ -824,9 +826,9 @@ | |||||
| .L25: # N=4 M=2 K=2 | .L25: # N=4 M=2 K=2 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,2 # k = KCO&2 | |||||
| andi K,KCO,2 # k = KCO&2 | |||||
| #else | #else | ||||
| and K,TEMP,2 | |||||
| andi K,TEMP,2 | |||||
| #endif | #endif | ||||
| beqz K,.L28 | beqz K,.L28 | ||||
| nop | nop | ||||
| @@ -867,9 +869,9 @@ | |||||
| .L28: # N=4, M=2, K=1 | .L28: # N=4, M=2, K=1 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,1 | |||||
| andi K,KCO,1 | |||||
| #else | #else | ||||
| and K,TEMP,1 | |||||
| andi K,TEMP,1 | |||||
| #endif | #endif | ||||
| beqz K,.L29 # | beqz K,.L29 # | ||||
| LD ALPHA,152($sp) # Get ALPHA | LD ALPHA,152($sp) # Get ALPHA | ||||
| @@ -917,7 +919,6 @@ | |||||
| MADD t24,c24,t24,ALPHA | MADD t24,c24,t24,ALPHA | ||||
| ST t13,0(CO3) | ST t13,0(CO3) | ||||
| move B,BO # Reset B | |||||
| ST t23,1*SIZE(CO3) | ST t23,1*SIZE(CO3) | ||||
| daddu CO1,CO1,2*SIZE # COx += 2*8Byte | daddu CO1,CO1,2*SIZE # COx += 2*8Byte | ||||
| @@ -985,7 +986,7 @@ | |||||
| .L14_M1: | .L14_M1: | ||||
| and M,MCO,1 # Remainder M = 1 | |||||
| andi M,MCO,1 # Remainder M = 1 | |||||
| beqz M,.L0_N4_Loop # M = 0, finishing one panel B | beqz M,.L0_N4_Loop # M = 0, finishing one panel B | ||||
| nop | nop | ||||
| @@ -1001,7 +1002,8 @@ | |||||
| daddu B,BO,TEMP | daddu B,BO,TEMP | ||||
| #endif | #endif | ||||
| gsLQC1(R8,F1,F0,0) | |||||
| LD a0, 0 * SIZE(A) | |||||
| # gsLQC1(R8,F1,F0,0) | |||||
| gsLQC1(R9,F9,F8,0) #b0,b1 | gsLQC1(R9,F9,F8,0) #b0,b1 | ||||
| MTC $0,t11 | MTC $0,t11 | ||||
| gsLQC1(R9,F11,F10,1) #b2,b3 | gsLQC1(R9,F11,F10,1) #b2,b3 | ||||
| @@ -1019,9 +1021,11 @@ | |||||
| beqz K,.L35 | beqz K,.L35 | ||||
| MOV t14,t11 | MOV t14,t11 | ||||
| #else | |||||
| #else | |||||
| # gemm | |||||
| move B,BO | move B,BO | ||||
| gsLQC1(R8,F1,F0,0) | |||||
| LD a0, 0 * SIZE(A) | |||||
| # gsLQC1(R8,F1,F0,0) | |||||
| dsra K,KCO,2 # K=KCO/4 | dsra K,KCO,2 # K=KCO/4 | ||||
| gsLQC1(R9,F9,F8,0) #b0,b1 | gsLQC1(R9,F9,F8,0) #b0,b1 | ||||
| MTC $0,t11 | MTC $0,t11 | ||||
| @@ -1034,7 +1038,8 @@ | |||||
| #endif | #endif | ||||
| .L31: # N=4 M=1 K=4 | .L31: # N=4 M=1 K=4 | ||||
| gsLQC1(R8,F3,F2,1) | |||||
| # gsLQC1(R8,F3,F2,1) | |||||
| LD a1, 1*SIZE(A) | |||||
| gsLQC1(R9,F13,F12,2) # R9=B | gsLQC1(R9,F13,F12,2) # R9=B | ||||
| MADD t11,t11,a0,b0 | MADD t11,t11,a0,b0 | ||||
| MADD t12,t12,a0,b1 | MADD t12,t12,a0,b1 | ||||
| @@ -1042,7 +1047,8 @@ | |||||
| gsLQC1(R9,F15,F14,3) | gsLQC1(R9,F15,F14,3) | ||||
| MADD t13,t13,a0,b2 | MADD t13,t13,a0,b2 | ||||
| MADD t14,t14,a0,b3 | MADD t14,t14,a0,b3 | ||||
| LD a2, 2*SIZE(A) | |||||
| gsLQC1(R9,F9,F8,4) | gsLQC1(R9,F9,F8,4) | ||||
| MADD t11,t11,a1,b4 | MADD t11,t11,a1,b4 | ||||
| MADD t12,t12,a1,b5 | MADD t12,t12,a1,b5 | ||||
| @@ -1051,18 +1057,21 @@ | |||||
| MADD t13,t13,a1,b6 | MADD t13,t13,a1,b6 | ||||
| MADD t14,t14,a1,b7 | MADD t14,t14,a1,b7 | ||||
| daddiu K,K,-1 | daddiu K,K,-1 | ||||
| LD a3, 3*SIZE(A) | |||||
| gsLQC1(R9,F13,F12,6) | gsLQC1(R9,F13,F12,6) | ||||
| MADD t11,t11,a2,b0 | MADD t11,t11,a2,b0 | ||||
| MADD t12,t12,a2,b1 | MADD t12,t12,a2,b1 | ||||
| daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=4*SIZE | |||||
| gsLQC1(R9,F15,F14,7) | gsLQC1(R9,F15,F14,7) | ||||
| MADD t13,t13,a2,b2 | MADD t13,t13,a2,b2 | ||||
| MADD t14,t14,a2,b3 | MADD t14,t14,a2,b3 | ||||
| daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=4*SIZE | |||||
| daddu B,B,16*SIZE # B+=4(nr)*4(kr)*8Byte=16*SIZE | daddu B,B,16*SIZE # B+=4(nr)*4(kr)*8Byte=16*SIZE | ||||
| gsLQC1(R8,F1,F0,0) | |||||
| # gsLQC1(R8,F1,F0,0) | |||||
| LD a0, 0*SIZE(A) | |||||
| gsLQC1(R9,F9,F8,0) | gsLQC1(R9,F9,F8,0) | ||||
| MADD t11,t11,a3,b4 | MADD t11,t11,a3,b4 | ||||
| MADD t12,t12,a3,b5 | MADD t12,t12,a3,b5 | ||||
| @@ -1074,14 +1083,15 @@ | |||||
| .L35: # N=4 M=1 K=2 | .L35: # N=4 M=1 K=2 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,2 # k = KCO&2 | |||||
| andi K,KCO,2 # k = KCO&2 | |||||
| #else | #else | ||||
| and K,TEMP,2 | |||||
| andi K,TEMP,2 | |||||
| #endif | #endif | ||||
| beqz K,.L38 | beqz K,.L38 | ||||
| nop | nop | ||||
| .L36: | .L36: | ||||
| LD a1,1*SIZE(A) | |||||
| gsLQC1(R9,F13,F12,2) # R9=B | gsLQC1(R9,F13,F12,2) # R9=B | ||||
| MADD t11,t11,a0,b0 | MADD t11,t11,a0,b0 | ||||
| MADD t12,t12,a0,b1 | MADD t12,t12,a0,b1 | ||||
| @@ -1095,7 +1105,6 @@ | |||||
| .L37: | .L37: | ||||
| LD a0,0(A) | LD a0,0(A) | ||||
| gsLQC1(R9,F9,F8,0) | gsLQC1(R9,F9,F8,0) | ||||
| MADD t11,t11,a1,b4 | MADD t11,t11,a1,b4 | ||||
| MADD t12,t12,a1,b5 | MADD t12,t12,a1,b5 | ||||
| @@ -1106,7 +1115,7 @@ | |||||
| .L38: # N=4, M=1, K=1 | .L38: # N=4, M=1, K=1 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,1 | |||||
| andi K,KCO,1 | |||||
| #else | #else | ||||
| andi K,TEMP,1 | andi K,TEMP,1 | ||||
| #endif | #endif | ||||
| @@ -1182,7 +1191,7 @@ | |||||
| .align 5 | .align 5 | ||||
| .L0_N2: | .L0_N2: | ||||
| and N,NCO,2 # Remainder N = 2 | |||||
| andi N,NCO,2 # Remainder N = 2 | |||||
| beqz N,.L0_N1 # N=0,NCO<2 | beqz N,.L0_N1 # N=0,NCO<2 | ||||
| nop | nop | ||||
| @@ -1336,7 +1345,7 @@ | |||||
| .L45: # N=2 M=4 K=2 | .L45: # N=2 M=4 K=2 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,2 # k = KCO&2 | |||||
| andi K,KCO,2 # k = KCO&2 | |||||
| #else | #else | ||||
| andi K,TEMP,2 | andi K,TEMP,2 | ||||
| #endif | #endif | ||||
| @@ -1383,7 +1392,7 @@ | |||||
| .L48: # N=2, M=4, K=1 | .L48: # N=2, M=4, K=1 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,1 | |||||
| andi K,KCO,1 | |||||
| #else | #else | ||||
| andi K,TEMP,1 | andi K,TEMP,1 | ||||
| #endif | #endif | ||||
| @@ -1497,7 +1506,7 @@ | |||||
| #endif | #endif | ||||
| .L12_M2: | .L12_M2: | ||||
| and M,MCO,2 # Remainder M = 2 | |||||
| andi M,MCO,2 # Remainder M = 2 | |||||
| beqz M,.L12_M1 | beqz M,.L12_M1 | ||||
| nop | nop | ||||
| @@ -1585,7 +1594,7 @@ | |||||
| .L55: # N=2 M=2 K=2 | .L55: # N=2 M=2 K=2 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,2 # k = KCO&2 | |||||
| andi K,KCO,2 # k = KCO&2 | |||||
| #else | #else | ||||
| andi K,TEMP,2 | andi K,TEMP,2 | ||||
| #endif | #endif | ||||
| @@ -1616,9 +1625,9 @@ | |||||
| .L58: # N=2, M=2, K=1 | .L58: # N=2, M=2, K=1 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,1 | |||||
| andi K,KCO,1 | |||||
| #else | #else | ||||
| and K, TEMP, 1 | |||||
| andi K, TEMP, 1 | |||||
| #endif | #endif | ||||
| beqz K,.L59 # | beqz K,.L59 # | ||||
| LD ALPHA,152($sp) # Get ALPHA | LD ALPHA,152($sp) # Get ALPHA | ||||
| @@ -1695,7 +1704,7 @@ | |||||
| .L12_M1: | .L12_M1: | ||||
| and M,MCO,1 # Remainder M = 1 | |||||
| andi M,MCO,1 # Remainder M = 1 | |||||
| beqz M,.L0_N2_Loop # M = 0, finishing one panel B | beqz M,.L0_N2_Loop # M = 0, finishing one panel B | ||||
| nop | nop | ||||
| @@ -1711,8 +1720,8 @@ | |||||
| daddu B, BO, TEMP | daddu B, BO, TEMP | ||||
| #endif | #endif | ||||
| MTC $0,t11 | MTC $0,t11 | ||||
| gsLQC1(R8,F4,F0,0) | |||||
| #gsLQC1(R8,F4,F0,0) | |||||
| LD a0, 0*SIZE(A) | |||||
| MOV t21,t11 | MOV t21,t11 | ||||
| MOV t12,t11 | MOV t12,t11 | ||||
| gsLQC1(R9,F9,F8,0) #b0,b1 | gsLQC1(R9,F9,F8,0) #b0,b1 | ||||
| @@ -1733,8 +1742,8 @@ | |||||
| dsra K,KCO,2 # K=KCO/4 | dsra K,KCO,2 # K=KCO/4 | ||||
| MTC $0,t11 | MTC $0,t11 | ||||
| move B,BO # Reset B | move B,BO # Reset B | ||||
| gsLQC1(R8,F4,F0,0) | |||||
| # gsLQC1(R8,F4,F0,0) | |||||
| LD a0,0*SIZE(A) | |||||
| MOV t21,t11 | MOV t21,t11 | ||||
| MOV t12,t11 | MOV t12,t11 | ||||
| gsLQC1(R9,F9,F8,0) #b0,b1 | gsLQC1(R9,F9,F8,0) #b0,b1 | ||||
| @@ -1745,23 +1754,27 @@ | |||||
| #endif | #endif | ||||
| .L61: # N=2 M=1 K=4 | .L61: # N=2 M=1 K=4 | ||||
| LD a4, 1*SIZE(A) | |||||
| gsLQC1(R9,F13,F12,1) # R9=B | gsLQC1(R9,F13,F12,1) # R9=B | ||||
| MADD t11,t11,a0,b0 | MADD t11,t11,a0,b0 | ||||
| MADD t12,t12,a0,b1 | MADD t12,t12,a0,b1 | ||||
| LD a2, 2*SIZE(A) | |||||
| gsLQC1(R9,F11,F10,2) | gsLQC1(R9,F11,F10,2) | ||||
| MADD t11,t11,a4,b4 | MADD t11,t11,a4,b4 | ||||
| MADD t12,t12,a4,b5 | MADD t12,t12,a4,b5 | ||||
| daddiu K,K,-1 | |||||
| gsLQC1(R8,F6,F2,1) | |||||
| # gsLQC1(R8,F6,F2,1) | |||||
| LD a6, 3*SIZE(A) | |||||
| MADD t11,t11,a2,b2 | MADD t11,t11,a2,b2 | ||||
| MADD t12,t12,a2,b3 | |||||
| daddiu K,K,-1 | |||||
| gsLQC1(R9,F15,F14,3) | gsLQC1(R9,F15,F14,3) | ||||
| MADD t12,t12,a2,b3 | |||||
| daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 | daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 | ||||
| # gsLQC1(R8,F4,F0,0) | |||||
| gsLQC1(R8,F4,F0,0) | |||||
| LD a0, 0*SIZE(A) | |||||
| daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE | daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE | ||||
| gsLQC1(R9,F9,F8,0) | gsLQC1(R9,F9,F8,0) | ||||
| @@ -1771,16 +1784,18 @@ | |||||
| .L65: # N=2 M=1 K=2 | .L65: # N=2 M=1 K=2 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,2 # k = KCO&2 | |||||
| andi K,KCO,2 # k = KCO&2 | |||||
| #else | #else | ||||
| and K,TEMP,2 | |||||
| andi K,TEMP,2 | |||||
| #endif | #endif | ||||
| beqz K,.L68 | beqz K,.L68 | ||||
| nop | nop | ||||
| .L66: | .L66: | ||||
| gsLQC1(R9,F13,F12,1) # R9=B | |||||
| LD a4, 1*SIZE(A) | |||||
| MADD t11,t11,a0,b0 | MADD t11,t11,a0,b0 | ||||
| gsLQC1(R9,F13,F12,1) # R9=B | |||||
| MADD t12,t12,a0,b1 | MADD t12,t12,a0,b1 | ||||
| daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 | daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 | ||||
| daddu B,B,4*SIZE | daddu B,B,4*SIZE | ||||
| @@ -1794,9 +1809,9 @@ | |||||
| .L68: # N=2, M=1, K=1 | .L68: # N=2, M=1, K=1 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,1 | |||||
| andi K,KCO,1 | |||||
| #else | #else | ||||
| and K,TEMP,1 | |||||
| andi K,TEMP,1 | |||||
| #endif | #endif | ||||
| beqz K,.L69 # | beqz K,.L69 # | ||||
| LD ALPHA,152($sp) # Get ALPHA | LD ALPHA,152($sp) # Get ALPHA | ||||
| @@ -1862,7 +1877,7 @@ | |||||
| .align 5 | .align 5 | ||||
| .L0_N1: | .L0_N1: | ||||
| and N,NCO,1 # Remainder N = 1 | |||||
| andi N,NCO,1 # Remainder N = 1 | |||||
| beqz N,.L999 # N=0,NCO<1 | beqz N,.L999 # N=0,NCO<1 | ||||
| nop | nop | ||||
| @@ -1889,7 +1904,8 @@ | |||||
| daddu A, A, K | daddu A, A, K | ||||
| daddu B, BO, TEMP | daddu B, BO, TEMP | ||||
| #endif | #endif | ||||
| gsLQC1(R9,F12,F8,0) | |||||
| # gsLQC1(R9,F12,F8,0) | |||||
| LD b0, 0*SIZE(B) | |||||
| MTC $0,t11 | MTC $0,t11 | ||||
| gsLQC1(R8,F1,F0,0) #a0,a1 | gsLQC1(R8,F1,F0,0) #a0,a1 | ||||
| MOV t21,t11 | MOV t21,t11 | ||||
| @@ -1908,7 +1924,8 @@ | |||||
| #else | #else | ||||
| move B, BO | move B, BO | ||||
| dsra K,KCO,2 # K=KCO/4 | dsra K,KCO,2 # K=KCO/4 | ||||
| gsLQC1(R9,F12,F8,0) | |||||
| # gsLQC1(R9,F12,F8,0) | |||||
| LD b0, 0*SIZE(B) | |||||
| MTC $0,t11 | MTC $0,t11 | ||||
| gsLQC1(R8,F1,F0,0) #a0,a1 | gsLQC1(R8,F1,F0,0) #a0,a1 | ||||
| MOV t21,t11 | MOV t21,t11 | ||||
| @@ -1925,17 +1942,19 @@ | |||||
| MADD t11,t11,a0,b0 | MADD t11,t11,a0,b0 | ||||
| MADD t21,t21,a1,b0 | MADD t21,t21,a1,b0 | ||||
| LD b4, 1*SIZE(B) | |||||
| FETCH $0,(PREA) | FETCH $0,(PREA) | ||||
| MADD t31,t31,a2,b0 | MADD t31,t31,a2,b0 | ||||
| MADD t41,t41,a3,b0 | MADD t41,t41,a3,b0 | ||||
| .L72: | .L72: | ||||
| gsLQC1(R9,F14,F10,1) | |||||
| # gsLQC1(R9,F14,F10,1) | |||||
| gsLQC1(R8,F1,F0,4) | gsLQC1(R8,F1,F0,4) | ||||
| gsLQC1(R8,F3,F2,5) | gsLQC1(R8,F3,F2,5) | ||||
| MADD t11,t11,a4,b4 | MADD t11,t11,a4,b4 | ||||
| MADD t21,t21,a5,b4 | MADD t21,t21,a5,b4 | ||||
| LD b2, 2*SIZE(B) | |||||
| FETCH $0,4*SIZE(PREA) | FETCH $0,4*SIZE(PREA) | ||||
| MADD t31,t31,a6,b4 | MADD t31,t31,a6,b4 | ||||
| MADD t41,t41,a7,b4 | MADD t41,t41,a7,b4 | ||||
| @@ -1944,24 +1963,28 @@ | |||||
| gsLQC1(R8,F5,F4,6) | gsLQC1(R8,F5,F4,6) | ||||
| gsLQC1(R8,F7,F6,7) | gsLQC1(R8,F7,F6,7) | ||||
| MADD t11,t11,a0,b2 | MADD t11,t11,a0,b2 | ||||
| LD b6, 3*SIZE(B) | |||||
| MADD t21,t21,a1,b2 | MADD t21,t21,a1,b2 | ||||
| daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 | |||||
| daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE | |||||
| FETCH $0,8*SIZE(PREA) | FETCH $0,8*SIZE(PREA) | ||||
| MADD t31,t31,a2,b2 | MADD t31,t31,a2,b2 | ||||
| MADD t41,t41,a3,b2 | MADD t41,t41,a3,b2 | ||||
| daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE | |||||
| daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 | |||||
| .L74: | .L74: | ||||
| gsLQC1(R9,F12,F8,0) | |||||
| # gsLQC1(R9,F12,F8,0) | |||||
| gsLQC1(R8,F1,F0,0) | gsLQC1(R8,F1,F0,0) | ||||
| daddu PREA,PREA,16*SIZE | daddu PREA,PREA,16*SIZE | ||||
| gsLQC1(R8,F3,F2,1) | gsLQC1(R8,F3,F2,1) | ||||
| MADD t11,t11,a4,b6 | MADD t11,t11,a4,b6 | ||||
| MADD t21,t21,a5,b6 | MADD t21,t21,a5,b6 | ||||
| LD b0, 0*SIZE(B) | |||||
| daddiu K,K,-1 | daddiu K,K,-1 | ||||
| FETCH $0,-32(PREA) | FETCH $0,-32(PREA) | ||||
| MADD t31,t31,a6,b6 | MADD t31,t31,a6,b6 | ||||
| bnez K,.L71 | bnez K,.L71 | ||||
| MADD t41,t41,a7,b6 | MADD t41,t41,a7,b6 | ||||
| @@ -1969,9 +1992,9 @@ | |||||
| .L75: # N=1 M=4 K=2 | .L75: # N=1 M=4 K=2 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,2 # k = KCO&2 | |||||
| andi K,KCO,2 # k = KCO&2 | |||||
| #else | #else | ||||
| and K,TEMP,2 | |||||
| andi K,TEMP,2 | |||||
| #endif | #endif | ||||
| beqz K,.L78 | beqz K,.L78 | ||||
| nop | nop | ||||
| @@ -1981,20 +2004,21 @@ | |||||
| gsLQC1(R8,F7,F6,3) | gsLQC1(R8,F7,F6,3) | ||||
| MADD t11,t11,a0,b0 | MADD t11,t11,a0,b0 | ||||
| MADD t21,t21,a1,b0 | MADD t21,t21,a1,b0 | ||||
| daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 | |||||
| daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE | |||||
| LD b4, 1*SIZE(B) | |||||
| FETCH $0,0(PREA) | FETCH $0,0(PREA) | ||||
| MADD t31,t31,a2,b0 | MADD t31,t31,a2,b0 | ||||
| MADD t41,t41,a3,b0 | MADD t41,t41,a3,b0 | ||||
| daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE | |||||
| daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 | |||||
| .L77: | .L77: | ||||
| LD b0,0(B) | |||||
| gsLQC1(R8,F1,F0,0) | gsLQC1(R8,F1,F0,0) | ||||
| gsLQC1(R8,F3,F2,1) | gsLQC1(R8,F3,F2,1) | ||||
| MADD t11,t11,a4,b4 | MADD t11,t11,a4,b4 | ||||
| MADD t21,t21,a5,b4 | MADD t21,t21,a5,b4 | ||||
| LD b0,0(B) | |||||
| FETCH $0,4*SIZE(PREA) | FETCH $0,4*SIZE(PREA) | ||||
| MADD t31,t31,a6,b4 | MADD t31,t31,a6,b4 | ||||
| MADD t41,t41,a7,b4 | MADD t41,t41,a7,b4 | ||||
| @@ -2004,9 +2028,9 @@ | |||||
| .L78: # N=1, M=4, K=1 | .L78: # N=1, M=4, K=1 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,1 | |||||
| andi K,KCO,1 | |||||
| #else | #else | ||||
| and K,TEMP,1 | |||||
| andi K,TEMP,1 | |||||
| #endif | #endif | ||||
| beqz K,.L79 # | beqz K,.L79 # | ||||
| LD ALPHA,152($sp) # Get ALPHA | LD ALPHA,152($sp) # Get ALPHA | ||||
| @@ -2084,7 +2108,7 @@ | |||||
| .L11_M2: | .L11_M2: | ||||
| and M,MCO,2 # Remainder M = 2 | |||||
| andi M,MCO,2 # Remainder M = 2 | |||||
| beqz M,.L11_M1 | beqz M,.L11_M1 | ||||
| nop | nop | ||||
| @@ -2100,7 +2124,8 @@ | |||||
| daddu B, BO, TEMP | daddu B, BO, TEMP | ||||
| #endif | #endif | ||||
| gsLQC1(R9,F12,F8,0) | |||||
| # gsLQC1(R9,F12,F8,0) | |||||
| LD b0, 0*SIZE(B) | |||||
| MTC $0,t11 | MTC $0,t11 | ||||
| gsLQC1(R8,F1,F0,0) #a0,a1 | gsLQC1(R8,F1,F0,0) #a0,a1 | ||||
| MOV t21,t11 | MOV t21,t11 | ||||
| @@ -2117,7 +2142,8 @@ | |||||
| #else | #else | ||||
| move B, BO | move B, BO | ||||
| dsra K,KCO,2 # K=KCO/4 | dsra K,KCO,2 # K=KCO/4 | ||||
| gsLQC1(R9,F12,F8,0) | |||||
| # gsLQC1(R9,F12,F8,0) | |||||
| LD b0, 0*SIZE(B) | |||||
| MTC $0,t11 | MTC $0,t11 | ||||
| gsLQC1(R8,F1,F0,0) #a0,a1 | gsLQC1(R8,F1,F0,0) #a0,a1 | ||||
| MOV t21,t11 | MOV t21,t11 | ||||
| @@ -2126,34 +2152,39 @@ | |||||
| #endif | #endif | ||||
| .L81: # N=1,M=2,K=4 | .L81: # N=1,M=2,K=4 | ||||
| LD b4, 1*SIZE(B) | |||||
| gsLQC1(R8,F5,F4,1) # R8=A | gsLQC1(R8,F5,F4,1) # R8=A | ||||
| MADD t11,t11,a0,b0 | MADD t11,t11,a0,b0 | ||||
| MADD t21,t21,a1,b0 | MADD t21,t21,a1,b0 | ||||
| LD b2, 2*SIZE(B) | |||||
| gsLQC1(R8,F3,F2,2) | gsLQC1(R8,F3,F2,2) | ||||
| MADD t11,t11,a4,b4 | MADD t11,t11,a4,b4 | ||||
| MADD t21,t21,a5,b4 | MADD t21,t21,a5,b4 | ||||
| gsLQC1(R9,F14,F10,1) | |||||
| daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 | |||||
| # gsLQC1(R9,F14,F10,1) | |||||
| LD b6, 3*SIZE(B) | |||||
| gsLQC1(R8,F7,F6,3) | gsLQC1(R8,F7,F6,3) | ||||
| MADD t11,t11,a2,b2 | MADD t11,t11,a2,b2 | ||||
| MADD t21,t21,a3,b2 | MADD t21,t21,a3,b2 | ||||
| daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE | daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE | ||||
| daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 | |||||
| gsLQC1(R9,F12,F8,0) | |||||
| daddiu K,K,-1 | |||||
| # gsLQC1(R9,F12,F8,0) | |||||
| gsLQC1(R8,F1,F0,0) | gsLQC1(R8,F1,F0,0) | ||||
| daddiu K,K,-1 | |||||
| MADD t11,t11,a6,b6 | MADD t11,t11,a6,b6 | ||||
| LD b0, 0*SIZE(B) | |||||
| bnez K,.L81 | bnez K,.L81 | ||||
| MADD t21,t21,a7,b6 | MADD t21,t21,a7,b6 | ||||
| .L85: # N=1 M=2 K=2 | .L85: # N=1 M=2 K=2 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,2 # k = KCO&2 | |||||
| andi K,KCO,2 # k = KCO&2 | |||||
| #else | #else | ||||
| andi K,TEMP,2 | andi K,TEMP,2 | ||||
| #endif | #endif | ||||
| @@ -2163,21 +2194,22 @@ | |||||
| .L86: | .L86: | ||||
| gsLQC1(R8,F5,F4,1) # R8=A | gsLQC1(R8,F5,F4,1) # R8=A | ||||
| LD b4, 1*SIZE(B) | |||||
| MADD t11,t11,a0,b0 | MADD t11,t11,a0,b0 | ||||
| MADD t21,t21,a1,b0 | MADD t21,t21,a1,b0 | ||||
| daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 | |||||
| LD b0,0(B) | |||||
| daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 | daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32 | ||||
| daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 | |||||
| gsLQC1(R8,F1,F0,0) | gsLQC1(R8,F1,F0,0) | ||||
| LD b0,0(B) | |||||
| MADD t11,t11,a4,b4 | MADD t11,t11,a4,b4 | ||||
| MADD t21,t21,a5,b4 | MADD t21,t21,a5,b4 | ||||
| .L88: # N=1, M=2, K=1 | .L88: # N=1, M=2, K=1 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,1 | |||||
| andi K,KCO,1 | |||||
| #else | #else | ||||
| andi K,TEMP,1 | andi K,TEMP,1 | ||||
| #endif | #endif | ||||
| @@ -2236,7 +2268,7 @@ | |||||
| .L11_M1: | .L11_M1: | ||||
| and M,MCO,1 # Remainder M = 1 | |||||
| andi M,MCO,1 # Remainder M = 1 | |||||
| beqz M,.L999 # M = 0, End | beqz M,.L999 # M = 0, End | ||||
| nop | nop | ||||
| @@ -2251,9 +2283,11 @@ | |||||
| daddu A, A, K | daddu A, A, K | ||||
| daddu B, BO, TEMP | daddu B, BO, TEMP | ||||
| #endif | #endif | ||||
| gsLQC1(R8,F4,F0,0) | |||||
| # gsLQC1(R8,F4,F0,0) | |||||
| MTC $0,t11 | MTC $0,t11 | ||||
| gsLQC1(R9,F12,F8,0) | |||||
| # gsLQC1(R9,F12,F8,0) | |||||
| LD a0, 0*SIZE(A) | |||||
| LD b0, 0*SIZE(B) | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | ||||
| dsubu TEMP, KCO, KK | dsubu TEMP, KCO, KK | ||||
| #elif defined(LEFT) | #elif defined(LEFT) | ||||
| @@ -2268,33 +2302,45 @@ | |||||
| #else | #else | ||||
| move B, BO | move B, BO | ||||
| dsra K,KCO,2 # K=KCO/4 | dsra K,KCO,2 # K=KCO/4 | ||||
| gsLQC1(R8,F4,F0,0) | |||||
| gsLQC1(R9,F12,F8,0) | |||||
| # gsLQC1(R8,F4,F0,0) | |||||
| # gsLQC1(R9,F12,F8,0) | |||||
| LD a0, 0*SIZE(A) | |||||
| LD b0, 0*SIZE(B) | |||||
| beqz K,.L95 | beqz K,.L95 | ||||
| MTC $0,t11 | MTC $0,t11 | ||||
| #endif | #endif | ||||
| .L91: # N=1,M=1,K=4 | .L91: # N=1,M=1,K=4 | ||||
| gsLQC1(R8,F6,F2,1) | |||||
| # gsLQC1(R8,F6,F2,1) | |||||
| LD a4, 1*SIZE(A) | |||||
| LD b4, 1*SIZE(B) | |||||
| MADD t11,t11,a0,b0 | MADD t11,t11,a0,b0 | ||||
| gsLQC1(R9,F14,F10,1) | |||||
| # gsLQC1(R9,F14,F10,1) | |||||
| LD a2, 2*SIZE(A) | |||||
| LD b2, 2*SIZE(B) | |||||
| MADD t11,t11,a4,b4 | MADD t11,t11,a4,b4 | ||||
| daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 | |||||
| gsLQC1(R8,F4,F0,0) | |||||
| # gsLQC1(R8,F4,F0,0) | |||||
| LD a6, 3*SIZE(A) | |||||
| LD b6, 3*SIZE(B) | |||||
| MADD t11,t11,a2,b2 | MADD t11,t11,a2,b2 | ||||
| daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 | |||||
| gsLQC1(R9,F12,F8,0) | |||||
| daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32 | |||||
| daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32 | |||||
| LD a0, 0*SIZE(A) | |||||
| LD b0, 0*SIZE(B) | |||||
| # gsLQC1(R9,F12,F8,0) | |||||
| MADD t11,t11,a6,b6 | MADD t11,t11,a6,b6 | ||||
| daddiu K,K,-1 | daddiu K,K,-1 | ||||
| bnez K,.L91 | bnez K,.L91 | ||||
| nop | nop | ||||
| .L95: # N=1 M=1 K=2 | .L95: # N=1 M=1 K=2 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,2 # k = KCO&2 | |||||
| andi K,KCO,2 # k = KCO&2 | |||||
| #else | #else | ||||
| andi K,TEMP,2 | andi K,TEMP,2 | ||||
| #endif | #endif | ||||
| @@ -2302,18 +2348,21 @@ | |||||
| nop | nop | ||||
| .L96: | .L96: | ||||
| LD a4, 1*SIZE(A) | |||||
| LD b4, 1*SIZE(B) | |||||
| MADD t11,t11,a0,b0 | MADD t11,t11,a0,b0 | ||||
| MADD t11,t11,a4,b4 | |||||
| daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 | daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16 | ||||
| daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 | daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16 | ||||
| LD b0,0(B) | LD b0,0(B) | ||||
| LD a0,0(A) | LD a0,0(A) | ||||
| MADD t11,t11,a4,b4 | |||||
| .L98: # N=1, M=1, K=1 | .L98: # N=1, M=1, K=1 | ||||
| #ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
| and K,KCO,1 | |||||
| andi K,KCO,1 | |||||
| #else | #else | ||||
| andi K,TEMP,1 | andi K,TEMP,1 | ||||
| #endif | #endif | ||||