| @@ -146,6 +146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define JMP jmp | |||
| #define NOP | |||
| #define XOR xorpd | |||
| #undef MOVQ | |||
| #define MOVQ movq | |||
| #define XOR_SY vxorps | |||
| @@ -305,7 +306,7 @@ movq %r11, kk; | |||
| MOVQ bn,j; | |||
| SARQ $2,j; # Rn = 4 | |||
| JLE .L0_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L0_bodyB:; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| MOVQ OFFSET, %rax; | |||
| @@ -320,7 +321,7 @@ MOVQ ba,ptrba; | |||
| MOVQ bm,i; | |||
| SARQ $3,i; # Rm = 8 | |||
| JLE .L1_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L1_bodyB:; | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -367,7 +368,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2,k; # Unroll 4 times | |||
| JLE .L2_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L2_bodyB:; | |||
| # Computing kernel | |||
| @@ -591,7 +592,7 @@ ADD2_SY yvec7, yvec8, yvec8; | |||
| .L2_bodyE:; | |||
| DECQ k; | |||
| JG .L2_bodyB; | |||
| .align 64; | |||
| ALIGN_5 | |||
| .L2_loopE:; | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -599,7 +600,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L3_loopE; | |||
| .align 64 | |||
| ALIGN_5 | |||
| .L3_loopB: | |||
| ######### Unroll 1 ################## | |||
| PREFETCH0 PRESIZE*SIZE(ptrba) | |||
| @@ -717,7 +718,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L4_loopE; | |||
| .align 64 | |||
| ALIGN_5 | |||
| .L4_loopB:; | |||
| ######### Unroll 1 ################## | |||
| PREFETCH0 PRESIZE*SIZE(ptrba) | |||
| @@ -875,7 +876,7 @@ MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L4_loopEx; | |||
| .align 32 | |||
| ALIGN_5 | |||
| EXTRA_SY $1,yvec15,xvec7; | |||
| EXTRA_SY $1,yvec14,xvec6; | |||
| EXTRA_SY $1,yvec13,xvec5; | |||
| @@ -934,7 +935,7 @@ ADDQ $16*SIZE,C1; | |||
| DECQ i; | |||
| JG .L1_bodyB; | |||
| JMP .L1_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L4_loopEx: | |||
| EXTRA_SY $1, yvec15, xvec7; | |||
| #ifndef TRMMKERNEL | |||
| @@ -1077,11 +1078,11 @@ ADDQ $16*SIZE, C0; | |||
| ADDQ $16*SIZE, C1; | |||
| DECQ i; | |||
| JG .L1_bodyB; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L1_loopE:; | |||
| TEST $4, bm; | |||
| JLE .L5_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L5_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -1113,7 +1114,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L8_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L8_bodyB: | |||
| #### Unroll times 1 #### | |||
| LD_SY 0*SIZE(ptrba), yvec0; | |||
| @@ -1242,7 +1243,7 @@ ADDQ $32*SIZE, ptrba; | |||
| ADDQ $32*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L8_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L8_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -1250,7 +1251,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L9_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L9_bodyB: | |||
| #### Unroll times 1 #### | |||
| LD_SY 0*SIZE(ptrba), yvec0; | |||
| @@ -1323,7 +1324,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L10_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L10_bodyB: | |||
| #### Unroll times 1 #### | |||
| LD_SY 0*SIZE(ptrba), yvec0; | |||
| @@ -1494,7 +1495,7 @@ ADDQ $8*SIZE, C1; | |||
| .L5_loopE: | |||
| TEST $2, bm; | |||
| JLE .L6_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L6_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -1527,7 +1528,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L11_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L11_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 | |||
| EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 | |||
| @@ -1652,7 +1653,7 @@ ADDQ $16*SIZE, ptrba; | |||
| ADDQ $32*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L11_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L11_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -1660,7 +1661,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L12_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L12_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 | |||
| EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 | |||
| @@ -1731,7 +1732,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L13_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L13_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 | |||
| EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 | |||
| @@ -1875,7 +1876,7 @@ ADDQ $4*SIZE, C1; | |||
| .L6_loopE: | |||
| TEST $1, bm; | |||
| JLE .L7_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L7_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -1905,7 +1906,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L14_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L14_bodyB: | |||
| BROAD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -1978,7 +1979,7 @@ ADDQ $8*SIZE, ptrba; | |||
| ADDQ $32*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L14_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L14_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -1986,7 +1987,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L15_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L15_bodyB: | |||
| BROAD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -2031,7 +2032,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L16_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L16_bodyB: | |||
| BROAD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -2129,11 +2130,11 @@ LEAQ (C,ldc,4),C; | |||
| .L0_bodyE:; | |||
| DECQ j; | |||
| JG .L0_bodyB; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L0_loopE:; | |||
| TEST $2, bn; | |||
| JLE .L20_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L20_bodyB: | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| MOVQ OFFSET, %rax; | |||
| @@ -2145,7 +2146,7 @@ MOVQ ba, ptrba; | |||
| MOVQ bm, i; | |||
| SARQ $3, i; | |||
| JLE .L21_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L21_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -2181,7 +2182,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L211_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L211_bodyB: | |||
| EDUP_SX 0*SIZE(ptrbb), xvec4; | |||
| ODUP_SX 0*SIZE(ptrbb), xvec5; | |||
| @@ -2430,7 +2431,7 @@ ADDQ $64*SIZE, ptrba; | |||
| ADDQ $16*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L211_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L211_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -2438,7 +2439,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L212_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L212_bodyB: | |||
| EDUP_SX 0*SIZE(ptrbb), xvec4; | |||
| ODUP_SX 0*SIZE(ptrbb), xvec5; | |||
| @@ -2571,7 +2572,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L213_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L213_bodyB: | |||
| EDUP_SX 0*SIZE(ptrbb), xvec4; | |||
| ODUP_SX 0*SIZE(ptrbb), xvec5; | |||
| @@ -2825,11 +2826,11 @@ ADDQ $16*SIZE, C0; | |||
| ADDQ $16*SIZE, C1; | |||
| DECQ i; | |||
| JG .L21_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L21_loopE: | |||
| TEST $4, bm; | |||
| JLE .L22_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L22_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -2862,7 +2863,7 @@ MOVQ %rax, kkk; | |||
| SARQ $2, k; | |||
| JLE .L221_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L221_bodyB: | |||
| EDUP_SX 0*SIZE(ptrbb), xvec4; | |||
| ODUP_SX 0*SIZE(ptrbb), xvec5; | |||
| @@ -3002,7 +3003,7 @@ ADDQ $32*SIZE, ptrba; | |||
| ADDQ $16*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L221_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L221_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -3010,7 +3011,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L222_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L222_bodyB: | |||
| EDUP_SX 0*SIZE(ptrbb), xvec4; | |||
| ODUP_SX 0*SIZE(ptrbb), xvec5; | |||
| @@ -3089,7 +3090,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L223_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L223_bodyB: | |||
| EDUP_SX 0*SIZE(ptrbb), xvec4; | |||
| ODUP_SX 0*SIZE(ptrbb), xvec5; | |||
| @@ -3237,7 +3238,7 @@ ADDQ $8*SIZE, C1; | |||
| .L22_loopE: | |||
| TEST $2, bm; | |||
| JLE .L23_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L23_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -3267,7 +3268,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L231_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L231_bodyB: | |||
| EDUP_SX 0*SIZE(ptrbb), xvec4; | |||
| ODUP_SX 0*SIZE(ptrbb), xvec5; | |||
| @@ -3351,7 +3352,7 @@ ADDQ $16*SIZE, ptrba; | |||
| ADDQ $16*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L231_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L231_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -3359,7 +3360,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L232_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L232_bodyB: | |||
| EDUP_SX 0*SIZE(ptrbb), xvec4; | |||
| ODUP_SX 0*SIZE(ptrbb), xvec5; | |||
| @@ -3409,7 +3410,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L233_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L233_bodyB: | |||
| EDUP_SX 0*SIZE(ptrbb), xvec4; | |||
| ODUP_SX 0*SIZE(ptrbb), xvec5; | |||
| @@ -3503,7 +3504,7 @@ ADDQ $4*SIZE, C1; | |||
| .L23_loopE: | |||
| TEST $1, bm; | |||
| JLE .L24_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L24_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -3532,7 +3533,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L241_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L241_bodyB: | |||
| BROAD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -3585,7 +3586,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L242_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L242_bodyB: | |||
| BROAD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -3616,7 +3617,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L243_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L243_bodyB: | |||
| BROAD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -3684,7 +3685,7 @@ LEAQ (C, ldc, 2), C; | |||
| .L20_loopE: | |||
| TEST $1, bn; | |||
| JLE .L30_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L30_bodyB: | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| MOVQ OFFSET, %rax; | |||
| @@ -3695,7 +3696,7 @@ MOVQ ba, ptrba; | |||
| MOVQ bm, i; | |||
| SARQ $3, i; | |||
| JLE .L31_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L31_bodyB: | |||
| MOVQ bb, ptrbb; | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| @@ -3727,7 +3728,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L311_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L311_bodyB: | |||
| #### Unroll 1 #### | |||
| LD_SY 0*SIZE(ptrba), yvec0; | |||
| @@ -3800,7 +3801,7 @@ ADDQ $64*SIZE, ptrba; | |||
| ADDQ $8*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L311_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L311_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -3808,7 +3809,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L312_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L312_bodyB: | |||
| #### Unroll 1 #### | |||
| LD_SY 0*SIZE(ptrba), yvec0; | |||
| @@ -3853,7 +3854,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L313_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L313_bodyB: | |||
| #### Unroll 1 #### | |||
| LD_SY 0*SIZE(ptrba), yvec0; | |||
| @@ -3941,11 +3942,11 @@ ADDQ $8, kk; | |||
| ADDQ $16*SIZE, C0; | |||
| DECQ i; | |||
| JG .L31_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L31_loopE: | |||
| TEST $4, bm; | |||
| JLE .L32_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L32_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -3974,7 +3975,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L321_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L321_bodyB: | |||
| #### Unroll 1 #### | |||
| LD_SY 0*SIZE(ptrba), yvec0; | |||
| @@ -4023,7 +4024,7 @@ ADDQ $32*SIZE, ptrba; | |||
| ADDQ $8*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L321_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L321_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -4031,7 +4032,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L322_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L322_bodyB: | |||
| #### Unroll 1 #### | |||
| LD_SY 0*SIZE(ptrba), yvec0; | |||
| @@ -4064,7 +4065,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L323_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L323_bodyB: | |||
| #### Unroll 1 #### | |||
| LD_SY 0*SIZE(ptrba), yvec0; | |||
| @@ -4128,7 +4129,7 @@ ADDQ $8*SIZE, C0; | |||
| .L32_loopE: | |||
| TEST $2, bm; | |||
| JLE .L33_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L33_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -4157,7 +4158,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L331_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L331_bodyB: | |||
| #### Unroll 1 #### | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| @@ -4202,7 +4203,7 @@ ADDQ $16*SIZE, ptrba; | |||
| ADDQ $8*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L331_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L331_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -4210,7 +4211,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L332_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L332_bodyB: | |||
| #### Unroll 1 #### | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| @@ -4241,7 +4242,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L333_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L333_bodyB: | |||
| #### Unroll 1 #### | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| @@ -4300,7 +4301,7 @@ ADDQ $4*SIZE, C0; | |||
| .L33_loopE: | |||
| TEST $1, bm; | |||
| JLE .L34_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L34_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -4329,7 +4330,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L341_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L341_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -4354,7 +4355,7 @@ ADDQ $8*SIZE, ptrba; | |||
| ADDQ $8*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L341_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L341_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -4362,7 +4363,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L342_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L342_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -4383,7 +4384,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L343_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L343_bodyB: | |||
| XOR_SY yvec0, yvec0, yvec0; | |||
| XOR_SY yvec2, yvec2, yvec2; | |||
| @@ -140,6 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define JNE jne | |||
| #define NOP | |||
| #define XOR xorpd | |||
| #undef MOVQ | |||
| #define MOVQ movq | |||
| #define XOR_SY vxorps | |||
| @@ -265,7 +266,7 @@ movq %r11, kk | |||
| MOVQ bn,j; | |||
| SARQ $2,j; # Rn = 4 | |||
| JLE .L0_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L0_bodyB:; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| MOVQ OFFSET, %rax; | |||
| @@ -281,7 +282,7 @@ MOVQ ba,ptrba; | |||
| MOVQ bm,i; | |||
| SARQ $3,i; # Rm = 8 | |||
| JLE .L1_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L1_bodyB:; | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -328,7 +329,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2,k; | |||
| JLE .L2_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L2_bodyB:; | |||
| # Computing kernel | |||
| @@ -448,7 +449,7 @@ ADD_DY yvec8, yvec7, yvec8; | |||
| .L2_bodyE:; | |||
| DECQ k; | |||
| JG .L2_bodyB; | |||
| .align 64; | |||
| ALIGN_5 | |||
| .L2_loopE:; | |||
| PREFETCH2 0*SIZE(prebb); | |||
| ADDQ $8*SIZE, prebb; | |||
| @@ -459,7 +460,7 @@ MOVQ kkk, %rax; | |||
| TEST $2, %rax; | |||
| #endif | |||
| JLE .L3_loopE; | |||
| .align 64 | |||
| ALIGN_5 | |||
| .L3_bodyB: | |||
| #### Unroll times 1 #### | |||
| PREFETCH0 64*SIZE(ptrba) | |||
| @@ -529,7 +530,7 @@ MOVQ kkk, %rax; | |||
| TEST $1, %rax; | |||
| #endif | |||
| JLE .L4_loopE; | |||
| .align 64 | |||
| ALIGN_5 | |||
| .L4_bodyB:; | |||
| #### Unroll times 1 #### | |||
| PREFETCH0 64*SIZE(ptrba) | |||
| @@ -588,7 +589,7 @@ MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L4_loopEx; # Unalign part write back | |||
| .align 32 | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| EXTRA_DY $1,yvec15,xvec7; | |||
| EXTRA_DY $1,yvec14,xvec6; | |||
| @@ -648,7 +649,7 @@ ADDQ $8*SIZE,C1; | |||
| DECQ i; | |||
| JG .L1_bodyB; | |||
| JMP .L1_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L4_loopEx:; | |||
| EXTRA_DY $1, yvec15, xvec7; | |||
| #ifndef TRMMKERNEL | |||
| @@ -776,11 +777,11 @@ ADDQ $8*SIZE, C0; | |||
| ADDQ $8*SIZE, C1; | |||
| DECQ i; | |||
| JG .L1_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L1_loopE:; | |||
| TEST $4, bm; # Rm = 4 | |||
| JLE .L5_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L5_bodyB:; | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -816,7 +817,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L6_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L6_bodyB:; | |||
| # Computing kernel | |||
| @@ -887,7 +888,7 @@ MUL_DY yvec1, yvec5, yvec7; | |||
| ADD_DY yvec9, yvec7, yvec9; | |||
| DECQ k; | |||
| JG .L6_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L6_loopE:; | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -896,7 +897,7 @@ MOVQ kkk, %rax; | |||
| TEST $2, %rax; | |||
| #endif | |||
| JLE .L7_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L7_bodyB:; | |||
| #### Unroll time 1 #### | |||
| LD_DY 4*SIZE(ptrba), yvec1; | |||
| @@ -940,7 +941,7 @@ MOVQ kkk, %rax; | |||
| TEST $1, %rax; | |||
| #endif | |||
| JLE .L8_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L8_bodyB:; | |||
| #### Unroll time 1 #### | |||
| MUL_DY yvec0, yvec2, yvec6; | |||
| @@ -977,7 +978,7 @@ MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L8_loopEx; # Unalign part write back | |||
| .align 32 | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| EXTRA_DY $1,yvec15,xvec7; | |||
| EXTRA_DY $1,yvec13,xvec5; | |||
| @@ -1014,7 +1015,7 @@ ADDQ $4, kk | |||
| ADDQ $4*SIZE, C0; | |||
| ADDQ $4*SIZE, C1; | |||
| JMP .L5_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L8_loopEx:; | |||
| EXTRA_DY $1,yvec15,xvec7; | |||
| EXTRA_DY $1,yvec13,xvec5; | |||
| @@ -1080,7 +1081,7 @@ ADDQ $4*SIZE, C1; | |||
| .L5_loopE:; | |||
| TEST $2, bm; | |||
| JLE .L9_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L9_bodyB:; | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -1117,7 +1118,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L10_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L10_bodyB:; | |||
| # Computing kernel | |||
| @@ -1192,7 +1193,7 @@ MUL_DX xvec1, xvec5; | |||
| ADD_DX xvec5, xvec9; | |||
| DECQ k; | |||
| JG .L10_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L10_loopE:; | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk | |||
| @@ -1201,7 +1202,7 @@ MOVQ kkk, %rax; | |||
| TEST $2, %rax; | |||
| #endif | |||
| JLE .L11_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L11_bodyB:; | |||
| ##### Unroll time 1 #### | |||
| LD_DX 4*SIZE(ptrbb), xvec6; | |||
| @@ -1248,7 +1249,7 @@ MOVQ kkk, %rax; | |||
| TEST $1, %rax; | |||
| #endif | |||
| JLE .L12_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L12_bodyB:; | |||
| SHUF_DX $0x4e, xvec3, xvec5; | |||
| MUL_DX xvec0, xvec2; | |||
| @@ -1285,7 +1286,7 @@ MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L12_loopEx; | |||
| .align 32 | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| #ifndef TRMMKERNEL | |||
| ADD_DX 0*SIZE(C0), xvec13; | |||
| @@ -1310,7 +1311,7 @@ ADDQ $2, kk | |||
| ADDQ $2*SIZE, C0 | |||
| ADDQ $2*SIZE, C1 | |||
| JMP .L9_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L12_loopEx: | |||
| #ifndef TRMMKERNEL | |||
| LDL_DX 0*SIZE(C0), xvec14; | |||
| @@ -1349,7 +1350,7 @@ ADDQ $2*SIZE, C1; | |||
| .L9_loopE:; | |||
| TEST $1, bm | |||
| JLE .L13_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L13_bodyB:; | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -1379,7 +1380,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L14_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L14_bodyB:; | |||
| BROAD_DY 0*SIZE(ptrba), yvec0; | |||
| LD_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -1404,7 +1405,7 @@ ADDQ $4*SIZE, ptrba; | |||
| ADDQ $16*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L14_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L14_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -1413,7 +1414,7 @@ MOVQ kkk, %rax; | |||
| TEST $2, %rax; | |||
| #endif | |||
| JLE .L15_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L15_bodyB: | |||
| BROAD_DY 0*SIZE(ptrba), yvec0; | |||
| LD_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -1434,7 +1435,7 @@ MOVQ kkk, %rax; | |||
| TEST $1, %rax; | |||
| #endif | |||
| JLE .L16_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L16_bodyB:; | |||
| BROAD_DY 0*SIZE(ptrba), yvec0; | |||
| LD_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -1485,11 +1486,11 @@ LEAQ (C,ldc,4),C; | |||
| .L0_bodyE:; | |||
| DECQ j; | |||
| JG .L0_bodyB; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L0_loopE:; | |||
| TEST $2, bn; | |||
| JLE .L20_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L20_loopB:; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| MOVQ OFFSET, %rax; | |||
| @@ -1501,7 +1502,7 @@ MOVQ ba, ptrba; | |||
| MOVQ bm, i; | |||
| SARQ $3, i; # Rm = 8 | |||
| JLE .L21_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L21_bodyB:; | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -1538,7 +1539,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L211_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L211_bodyB: | |||
| # Computing kernel | |||
| #### Unroll time 1 #### | |||
| @@ -1692,7 +1693,7 @@ MUL_DX xvec3, xvec7; | |||
| ADD_DX xvec7, xvec8; | |||
| DECQ k; | |||
| JG .L211_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L211_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -1701,7 +1702,7 @@ MOVQ kkk, %rax; | |||
| TEST $2, %rax; | |||
| #endif | |||
| JLE .L212_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L212_bodyB: | |||
| # Computing kernel | |||
| #### Unroll time 1 #### | |||
| @@ -1788,7 +1789,7 @@ MOVQ kkk, %rax; | |||
| TEST $1, %rax; | |||
| #endif | |||
| JLE .L213_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L213_bodyB: | |||
| #### Unroll time 1 #### | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| @@ -1858,7 +1859,7 @@ MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L213_loopEx; | |||
| .align 32 | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| #ifndef TRMMKERNEL | |||
| ADD_DX 0*SIZE(C0), xvec11; | |||
| @@ -1893,7 +1894,7 @@ ADDQ $8*SIZE, C1; | |||
| DECQ i; | |||
| JG .L21_bodyB; | |||
| JMP .L21_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L213_loopEx:; | |||
| #ifndef TRMMKERNEL | |||
| LDL_DX 0*SIZE(C0), xvec0; | |||
| @@ -1956,7 +1957,7 @@ JG .L21_bodyB; | |||
| .L21_loopE:; | |||
| TEST $4, bm; # Rm = 4 | |||
| JLE .L22_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L22_bodyB:; | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -1989,7 +1990,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L221_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L221_bodyB:; | |||
| # Computing kernel | |||
| #### Unroll time 1 #### | |||
| @@ -2071,7 +2072,7 @@ MUL_DX xvec1, xvec5; | |||
| ADD_DX xvec5, xvec10; | |||
| DECQ k; | |||
| JG .L221_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L221_loopE:; | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -2080,7 +2081,7 @@ MOVQ kkk, %rax; | |||
| TEST $2, %rax; | |||
| #endif | |||
| JLE .L222_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L222_bodyB: | |||
| #### Unroll time 1 #### | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| @@ -2129,7 +2130,7 @@ MOVQ kkk, %rax; | |||
| TEST $1, %rax; | |||
| #endif | |||
| JLE .L223_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L223_bodyB: | |||
| #### Unroll time 1 #### | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| @@ -2171,7 +2172,7 @@ MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L223_loopEx; | |||
| .align 32 | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| #ifndef TRMMKERNEL | |||
| ADD_DX 0*SIZE(C0), xvec11; | |||
| @@ -2196,7 +2197,7 @@ ADDQ $4, kk | |||
| ADDQ $4*SIZE, C0; | |||
| ADDQ $4*SIZE, C1; | |||
| JMP .L22_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L223_loopEx:; | |||
| #ifndef TRMMKERNEL | |||
| LDL_DX 0*SIZE(C0), xvec0; | |||
| @@ -2237,7 +2238,7 @@ ADDQ $4*SIZE, C1; | |||
| .L22_loopE:; | |||
| TEST $2, bm; # Rm = 2 | |||
| JLE .L23_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L23_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -2267,7 +2268,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L231_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L231_bodyB: | |||
| # Computing kernel | |||
| #### Unroll time 1 #### | |||
| @@ -2309,7 +2310,7 @@ ADD_DX xvec5, xvec11; | |||
| ADDQ $8*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L231_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L231_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -2318,7 +2319,7 @@ MOVQ kkk, %rax; | |||
| TEST $2, %rax; | |||
| #endif | |||
| JLE .L232_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L232_bodyB: | |||
| #### Unroll time 1 #### | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| @@ -2347,7 +2348,7 @@ MOVQ kkk, %rax; | |||
| TEST $1, %rax; | |||
| #endif | |||
| JLE .L233_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L233_bodyB: | |||
| #### Unroll time 1 #### | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| @@ -2373,7 +2374,7 @@ MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L233_loopEx; | |||
| .align 32 | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| #ifndef TRMMKERNEL | |||
| ADD_DX 0*SIZE(C0), xvec11; | |||
| @@ -2394,7 +2395,7 @@ ADDQ $2, kk; | |||
| ADDQ $2*SIZE, C0; | |||
| ADDQ $2*SIZE, C1; | |||
| JMP .L23_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L233_loopEx:; | |||
| #ifndef TRMMKERNEL | |||
| LDL_DX 0*SIZE(C0), xvec0; | |||
| @@ -2425,7 +2426,7 @@ ADDQ $2*SIZE, C1; | |||
| .L23_loopE: | |||
| TEST $1, bm; # Rm = 1 | |||
| JLE .L24_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L24_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -2454,7 +2455,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L241_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L241_bodyB: | |||
| BROAD_DX 0*SIZE(ptrba), xvec0; | |||
| LD_DX 0*SIZE(ptrbb), xvec2; | |||
| @@ -2479,7 +2480,7 @@ ADDQ $4*SIZE, ptrba; | |||
| ADDQ $8*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L241_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L241_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -2488,7 +2489,7 @@ MOVQ kkk, %rax; | |||
| TEST $2, %rax; | |||
| #endif | |||
| JLE .L242_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L242_bodyB: | |||
| BROAD_DX 0*SIZE(ptrba), xvec0; | |||
| LD_DX 0*SIZE(ptrbb), xvec2; | |||
| @@ -2509,7 +2510,7 @@ MOVQ kkk, %rax; | |||
| TEST $1, %rax; | |||
| #endif | |||
| JLE .L243_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L243_bodyB: | |||
| BROAD_DX 0*SIZE(ptrba), xvec0; | |||
| LD_DX 0*SIZE(ptrbb), xvec2; | |||
| @@ -2551,7 +2552,7 @@ LEAQ (C, ldc, 2), C; | |||
| .L20_loopE:; | |||
| TEST $1, bn; # Rn = 1 | |||
| JLE .L30_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L30_bodyB: | |||
| #if defined(TRMMKERNEL)&&defined(LEFT) | |||
| MOVQ OFFSET, %rax; | |||
| @@ -2562,7 +2563,7 @@ MOVQ ba, ptrba; | |||
| MOVQ bm, i; | |||
| SARQ $3, i; | |||
| JLE .L31_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L31_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -2593,7 +2594,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L311_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L311_bodyB: | |||
| #### Unroll time 1 #### | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| @@ -2634,7 +2635,7 @@ ADD_DY yvec4, yvec14, yvec14; | |||
| ADDQ $4*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L311_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L311_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -2643,7 +2644,7 @@ MOVQ kkk, %rax; | |||
| TEST $2, %rax; | |||
| #endif | |||
| JLE .L312_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L312_bodyB: | |||
| #### Unroll time 1 #### | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| @@ -2673,7 +2674,7 @@ MOVQ kkk, %rax; | |||
| TEST $1, %rax; | |||
| #endif | |||
| JLE .L313_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L313_bodyB: | |||
| #### Unroll time 1 #### | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| @@ -2696,7 +2697,7 @@ MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L313_loopEx; | |||
| .align 32 | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| EXTRA_DY $1, yvec15, xvec13; | |||
| EXTRA_DY $1, yvec14, xvec12; | |||
| @@ -2724,7 +2725,7 @@ ADDQ $8*SIZE, C0; | |||
| DECQ i; | |||
| JG .L31_bodyB; | |||
| JMP .L31_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L313_loopEx: | |||
| EXTRA_DY $1, yvec15, xvec13; | |||
| EXTRA_DY $1, yvec14, xvec12; | |||
| @@ -2766,7 +2767,7 @@ JG .L31_bodyB; | |||
| .L31_loopE: | |||
| TEST $4, bm | |||
| JLE .L32_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L32_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -2796,7 +2797,7 @@ MOVQ %rax, kkk | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L321_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L321_bodyB: | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| BROAD_DY 0*SIZE(ptrbb), yvec1; | |||
| @@ -2821,7 +2822,7 @@ ADDQ $16*SIZE, ptrba; | |||
| ADDQ $4*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L321_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L321_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -2830,7 +2831,7 @@ MOVQ kkk, %rax; | |||
| TEST $2, %rax; | |||
| #endif | |||
| JLE .L322_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L322_bodyB: | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| BROAD_DY 0*SIZE(ptrbb), yvec1; | |||
| @@ -2852,7 +2853,7 @@ MOVQ kkk, %rax; | |||
| TEST $1, %rax; | |||
| #endif | |||
| JLE .L323_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L323_bodyB: | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| BROAD_DY 0*SIZE(ptrbb), yvec1; | |||
| @@ -2870,7 +2871,7 @@ MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L323_loopEx; | |||
| .align 32 | |||
| ALIGN_5 | |||
| #### Writing Back #### | |||
| EXTRA_DY $1, yvec15, xvec14; | |||
| #ifndef TRMMKERNEL | |||
| @@ -2891,7 +2892,7 @@ ADDQ $4, kk | |||
| #endif | |||
| ADDQ $4*SIZE, C0; | |||
| JMP .L32_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L323_loopEx: | |||
| #### Writing Back #### | |||
| EXTRA_DY $1, yvec15, xvec14; | |||
| @@ -2921,7 +2922,7 @@ ADDQ $4*SIZE, C0; | |||
| .L32_loopE: | |||
| TEST $2, bm | |||
| JLE .L33_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L33_bodyB: | |||
| #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -2951,7 +2952,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L331_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L331_bodyB: | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| BROAD_DX 0*SIZE(ptrbb), xvec2; | |||
| @@ -2976,7 +2977,7 @@ ADDQ $8*SIZE, ptrba; | |||
| ADDQ $4*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L331_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L331_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2,bk; | |||
| @@ -2985,7 +2986,7 @@ MOVQ kkk, %rax; | |||
| TEST $2, %rax | |||
| #endif | |||
| JLE .L332_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L332_bodyB: | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| BROAD_DX 0*SIZE(ptrbb), xvec2; | |||
| @@ -3006,7 +3007,7 @@ MOVQ kkk, %rax; | |||
| TEST $1, %rax; | |||
| #endif | |||
| JLE .L333_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L333_bodyB: | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| BROAD_DX 0*SIZE(ptrbb), xvec2; | |||
| @@ -3039,7 +3040,7 @@ ADDQ $2*SIZE, C0; | |||
| .L33_loopE: | |||
| TEST $1, bm | |||
| JLE .L34_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L34_bodyB: | |||
| #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -3068,7 +3069,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L341_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L341_bodyB: | |||
| movsd 0*SIZE(ptrba), xvec0; | |||
| movsd 0*SIZE(ptrbb), xvec1; | |||
| @@ -3093,7 +3094,7 @@ addq $4*SIZE, ptrba; | |||
| addq $4*SIZE, ptrbb; | |||
| decq k; | |||
| JG .L341_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L341_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -3102,7 +3103,7 @@ MOVQ kkk, %rax; | |||
| TEST $2, %rax; | |||
| #endif | |||
| JLE .L342_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L342_bodyB: | |||
| movsd 0*SIZE(ptrba), xvec0; | |||
| movsd 0*SIZE(ptrbb), xvec1; | |||
| @@ -3124,7 +3125,7 @@ MOVQ kkk, %rax; | |||
| TEST $1, %rax; | |||
| #endif | |||
| JLE .L343_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L343_bodyB: | |||
| movsd 0*SIZE(ptrba), xvec0; | |||
| movsd 0*SIZE(ptrbb), xvec1; | |||
| @@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define JMP jmp | |||
| #define NOP | |||
| #define XOR xorpd | |||
| #undef MOVQ | |||
| #define MOVQ movq | |||
| #define XOR_SY vxorps | |||
| @@ -273,7 +274,7 @@ movq %r11, kk | |||
| MOVQ bn,j; | |||
| SARQ $3,j; | |||
| JLE .L0_loopE; | |||
| .align 16; | |||
| ALIGN_4; | |||
| .L0_bodyB:; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| MOVQ OFFSET, %rax; | |||
| @@ -289,7 +290,7 @@ MOVQ ba,ptrba; | |||
| MOVQ bm,i; | |||
| SARQ $3,i; | |||
| JLE .L1_loopE; | |||
| .align 16; | |||
| ALIGN_4; | |||
| .L1_bodyB:; | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -342,7 +343,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2,k; | |||
| JLE .L2_loopE; | |||
| .align 16; | |||
| ALIGN_4; | |||
| .L2_bodyB:; | |||
| # Computing kernel | |||
| @@ -472,7 +473,7 @@ ADD_SY yvec8, yvec7, yvec8; | |||
| .L2_bodyE:; | |||
| DECQ k; | |||
| JG .L2_bodyB; | |||
| .align 64; | |||
| ALIGN_4 | |||
| .L2_loopE:; | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -480,7 +481,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L3_loopE; | |||
| .align 64 | |||
| ALIGN_4 | |||
| .L3_loobB: | |||
| #### Unroll times 1 #### | |||
| MUL_SY yvec0, yvec2, yvec6; | |||
| @@ -550,7 +551,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L4_loopE; | |||
| .align 64 | |||
| ALIGN_4 | |||
| .L4_loopB:; | |||
| #### Unroll times 1 #### | |||
| MUL_SY yvec0, yvec2, yvec6; | |||
| @@ -609,7 +610,7 @@ MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L4_loopEx; | |||
| .align 16 | |||
| ALIGN_4 | |||
| LEAQ (ldc,ldc,2),%rax; | |||
| EXTRA_SY $1,yvec15,xvec7; | |||
| EXTRA_SY $1,yvec14,xvec6; | |||
| @@ -669,7 +670,7 @@ ADDQ $8*SIZE,C1; | |||
| DECQ i; | |||
| JG .L1_bodyB; | |||
| JMP .L1_loopE; | |||
| .align 16; | |||
| ALIGN_4; | |||
| .L4_loopEx: | |||
| LEAQ (ldc,ldc,2),%rax; | |||
| EXTRA_SY $1, yvec15, xvec7; | |||
| @@ -813,11 +814,11 @@ ADDQ $8*SIZE, C0; | |||
| ADDQ $8*SIZE, C1; | |||
| DECQ i; | |||
| JG .L1_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L1_loopE:; | |||
| TEST $4, bm; | |||
| JLE .L5_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L5_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -857,7 +858,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L8_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L8_bodyB: | |||
| #### Unroll time 1 #### | |||
| @@ -983,7 +984,7 @@ MUL_SX xvec1, xvec5; | |||
| ADD_SX xvec5, xvec8; | |||
| DECQ k; | |||
| JG .L8_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L8_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -991,7 +992,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L9_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L9_bodyB: | |||
| #### Unroll time 1 #### | |||
| SHUF_SX $0x4e, xvec2, xvec4; | |||
| @@ -1062,7 +1063,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L10_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L10_bodyB: | |||
| #### Unroll time 1 #### | |||
| SHUF_SX $0x4e, xvec2, xvec4; | |||
| @@ -1122,7 +1123,7 @@ MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L10_loopEx; | |||
| .align 16 | |||
| ALIGN_4 | |||
| LEAQ (ldc,ldc,2),%rax; | |||
| #ifndef TRMMKERNEL | |||
| ADD_SX 0*SIZE(C0), xvec15; | |||
| @@ -1155,7 +1156,7 @@ ADDQ $4, kk | |||
| ADDQ $4*SIZE, C0; | |||
| ADDQ $4*SIZE, C1; | |||
| JMP .L5_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L10_loopEx: | |||
| LEAQ (ldc,ldc,2),%rax; | |||
| #ifndef TRMMKERNEL | |||
| @@ -1215,7 +1216,7 @@ ADDQ $4*SIZE, C1; | |||
| .L5_loopE: | |||
| TEST $2, bm; | |||
| JLE .L6_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L6_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -1249,7 +1250,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L11_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L11_bodyB: | |||
| #### Computing kernel | |||
| LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 | |||
| @@ -1318,7 +1319,7 @@ ADDQ $8*SIZE, ptrba; | |||
| ADDQ $32*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L11_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L11_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -1326,7 +1327,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L12_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L12_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 | |||
| SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 | |||
| @@ -1368,7 +1369,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L13_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L13_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 | |||
| SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 | |||
| @@ -1433,7 +1434,7 @@ ADDQ $2*SIZE, C1; | |||
| .L6_loopE: | |||
| TEST $1, bm; | |||
| JLE .L7_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L7_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -1465,7 +1466,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L14_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L14_bodyB: | |||
| BROAD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -1503,7 +1504,7 @@ ADDQ $4*SIZE, ptrba; | |||
| ADDQ $32*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L14_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L14_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -1511,7 +1512,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L15_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L15_bodyB: | |||
| BROAD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -1538,7 +1539,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L16_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L16_bodyB: | |||
| BROAD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -1611,11 +1612,11 @@ LEAQ (C,ldc,8),C; | |||
| .L0_bodyE:; | |||
| DECQ j; | |||
| JG .L0_bodyB; | |||
| .align 16; | |||
| ALIGN_4; | |||
| .L0_loopE:; | |||
| TEST $4, bn; # Rn = 4 | |||
| JLE .L20_loopE; | |||
| .align 16; | |||
| ALIGN_4; | |||
| .L20_bodyB: | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| MOVQ OFFSET, %rax; | |||
| @@ -1628,7 +1629,7 @@ MOVQ ba, ptrba; | |||
| MOVQ bm, i; | |||
| SARQ $3, i; | |||
| JLE .L21_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L21_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -1668,7 +1669,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2,k; | |||
| JLE .L211_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L211_bodyB: | |||
| #### Unroll time 1 #### | |||
| ODUP_SX 0*SIZE(ptrbb), xvec3; | |||
| @@ -1800,7 +1801,7 @@ ADD_SX xvec7, xvec8; | |||
| LD_SX 4*SIZE(ptrba), xvec1; | |||
| DECQ k; | |||
| JG .L211_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L211_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk | |||
| @@ -1808,7 +1809,7 @@ TEST $2, bk | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L212_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L212_bodyB: | |||
| #### Unroll time 1 #### | |||
| ODUP_SX 0*SIZE(ptrbb), xvec3; | |||
| @@ -1882,7 +1883,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L213_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L213_bodyB: | |||
| ODUP_SX 0*SIZE(ptrbb), xvec3; | |||
| SHUF_SX $0x4e, xvec2, xvec4; | |||
| @@ -1982,11 +1983,11 @@ ADDQ $8*SIZE, C0; | |||
| ADDQ $8*SIZE, C1; | |||
| DECQ i; | |||
| JG .L21_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L21_loopE: | |||
| TEST $4, bm; | |||
| JLE .L22_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L22_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -2019,7 +2020,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L221_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L221_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| EDUP_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -2089,7 +2090,7 @@ ADDQ $16*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L221_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L221_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -2097,7 +2098,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L222_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L222_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| EDUP_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -2139,7 +2140,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L223_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L223_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| EDUP_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -2203,7 +2204,7 @@ ADDQ $4*SIZE, C1; | |||
| .L22_loopE: | |||
| TEST $2, bm; | |||
| JLE .L23_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L23_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -2234,7 +2235,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L231_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L231_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| EDUP_SX 0*SIZE(ptrbb), xvec4; | |||
| @@ -2274,7 +2275,7 @@ ADDQ $8*SIZE, ptrba; | |||
| ADDQ $16*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L231_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L231_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -2282,7 +2283,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L232_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L232_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| EDUP_SX 0*SIZE(ptrbb), xvec4; | |||
| @@ -2310,7 +2311,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L233_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L233_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| EDUP_SX 0*SIZE(ptrbb), xvec4; | |||
| @@ -2356,7 +2357,7 @@ ADDQ $2*SIZE, C1; | |||
| .L23_loopE: | |||
| TEST $1, bm; | |||
| JLE .L24_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L24_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -2386,7 +2387,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L241_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L241_bodyB: | |||
| BROAD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec1; | |||
| @@ -2419,7 +2420,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L242_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L242_bodyB: | |||
| BROAD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec1; | |||
| @@ -2440,7 +2441,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L243_loopE; | |||
| .align 16; | |||
| ALIGN_4; | |||
| .L243_bodyB: | |||
| BROAD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec1; | |||
| @@ -2491,7 +2492,7 @@ LEAQ (C, ldc, 4), C; | |||
| .L20_loopE: | |||
| TEST $2, bn; | |||
| JLE .L30_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L30_bodyB: | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| MOVQ OFFSET, %rax; | |||
| @@ -2503,7 +2504,7 @@ MOVQ ba, ptrba; | |||
| MOVQ bm, i; | |||
| SARQ $3, i; | |||
| JLE .L31_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L31_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -2536,7 +2537,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L311_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L311_bodyB: | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| SHUF_SX $0x50, xvec2, xvec3; | |||
| @@ -2612,7 +2613,7 @@ ADDQ $32*SIZE, ptrba; | |||
| ADDQ $8*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L311_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L311_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -2620,7 +2621,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L312_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L312_bodyB: | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| SHUF_SX $0x50, xvec2, xvec3; | |||
| @@ -2666,7 +2667,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L313_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L313_bodyB: | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| SHUF_SX $0x50, xvec2, xvec3; | |||
| @@ -2731,11 +2732,11 @@ ADDQ $8*SIZE, C0; | |||
| ADDQ $8*SIZE, C1; | |||
| DECQ i; | |||
| JG .L31_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L31_loopE: | |||
| TEST $4, bm; | |||
| JLE .L32_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L32_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -2766,7 +2767,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L321_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L321_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -2806,7 +2807,7 @@ ADDQ $16*SIZE, ptrba; | |||
| ADDQ $8*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L321_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L321_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -2814,7 +2815,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L322_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L322_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -2842,7 +2843,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L323_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L323_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| LD_SX 0*SIZE(ptrbb), xvec2; | |||
| @@ -2887,7 +2888,7 @@ ADDQ $4*SIZE, C1; | |||
| .L32_loopE: | |||
| TEST $2, bm; | |||
| JLE .L33_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L33_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -2920,7 +2921,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L331_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L331_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 | |||
| EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 | |||
| @@ -2943,7 +2944,7 @@ ADDQ $8*SIZE, ptrba; | |||
| ADDQ $8*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L331_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L331_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -2951,7 +2952,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L332_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L332_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 | |||
| EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 | |||
| @@ -2972,7 +2973,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L333_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L333_bodyB: | |||
| movss 0*SIZE(ptrba), xvec0; | |||
| movss 1*SIZE(ptrba), xvec1; | |||
| @@ -3031,7 +3032,7 @@ ADDQ $2*SIZE, C1; | |||
| .L33_loopE: | |||
| TEST $1, bm; | |||
| JLE .L34_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L34_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -3062,7 +3063,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L341_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L341_bodyB: | |||
| movss 0*SIZE(ptrba), xvec0; | |||
| movss 0*SIZE(ptrbb), xvec1; | |||
| @@ -3104,7 +3105,7 @@ addq $4*SIZE, ptrba; | |||
| addq $8*SIZE, ptrbb; | |||
| decq k; | |||
| jg .L341_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L341_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -3112,7 +3113,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L342_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L342_bodyB: | |||
| movss 0*SIZE(ptrba), xvec0; | |||
| movss 0*SIZE(ptrbb), xvec1; | |||
| @@ -3140,7 +3141,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L343_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L343_bodyB: | |||
| movss 0*SIZE(ptrba), xvec0; | |||
| movss 0*SIZE(ptrbb), xvec1; | |||
| @@ -3189,7 +3190,7 @@ LEAQ (C, ldc, 2), C; | |||
| .L30_loopE: | |||
| TEST $1, bn; | |||
| JLE .L40_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L40_bodyB: | |||
| #if defined(TRMMKERNEL)&&defined(LEFT) | |||
| MOVQ OFFSET, %rax; | |||
| @@ -3200,7 +3201,7 @@ MOVQ ba, ptrba; | |||
| MOVQ bm, i; | |||
| SARQ $3, i; | |||
| JLE .L41_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L41_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -3230,7 +3231,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L411_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L411_bodyB: | |||
| LD_SY 0*SIZE(ptrba), yvec0; | |||
| BROAD_SY 0*SIZE(ptrbb), yvec1; | |||
| @@ -3256,7 +3257,7 @@ ADDQ $32*SIZE, ptrba; | |||
| ADDQ $4*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L411_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L411_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -3264,7 +3265,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L412_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L412_bodyB: | |||
| LD_SY 0*SIZE(ptrba), yvec0; | |||
| BROAD_SY 0*SIZE(ptrbb), yvec1; | |||
| @@ -3285,7 +3286,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L413_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L413_bodyB: | |||
| LD_SY 0*SIZE(ptrba), yvec0; | |||
| BROAD_SY 0*SIZE(ptrbb), yvec1; | |||
| @@ -3329,11 +3330,11 @@ ADDQ $8, kk; | |||
| ADDQ $8*SIZE, C0; | |||
| DECQ i; | |||
| JG .L41_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L41_loopE: | |||
| TEST $4, bm; | |||
| JLE .L42_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L42_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -3362,7 +3363,7 @@ MOVQ %rax, kkk | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L421_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L421_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| BROAD_SX 0*SIZE(ptrbb), xvec1; | |||
| @@ -3387,7 +3388,7 @@ ADDQ $16*SIZE, ptrba; | |||
| ADDQ $4*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L421_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L421_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -3395,7 +3396,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L422_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L422_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| BROAD_SX 0*SIZE(ptrbb), xvec1; | |||
| @@ -3416,7 +3417,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L423_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L423_bodyB: | |||
| LD_SX 0*SIZE(ptrba), xvec0; | |||
| BROAD_SX 0*SIZE(ptrbb), xvec1; | |||
| @@ -3451,7 +3452,7 @@ ADDQ $4*SIZE, C0; | |||
| .L42_loopE: | |||
| TEST $2, bm; | |||
| JLE .L43_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L43_bodyB: | |||
| #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -3481,7 +3482,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L431_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L431_bodyB: | |||
| movss 0*SIZE(ptrba), xvec0; | |||
| movss 1*SIZE(ptrba), xvec1; | |||
| @@ -3518,7 +3519,7 @@ addq $8*SIZE, ptrba; | |||
| addq $4*SIZE, ptrbb; | |||
| decq k; | |||
| JG .L431_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L431_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -3526,7 +3527,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L432_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L432_bodyB: | |||
| movss 0*SIZE(ptrba), xvec0; | |||
| movss 1*SIZE(ptrba), xvec1; | |||
| @@ -3553,7 +3554,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L433_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L433_bodyB: | |||
| movss 0*SIZE(ptrba), xvec0; | |||
| movss 1*SIZE(ptrba), xvec1; | |||
| @@ -3592,7 +3593,7 @@ addq $2*SIZE, C0; | |||
| .L43_loopE: | |||
| TEST $1, bm; | |||
| JLE .L44_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L44_bodyB: | |||
| #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | |||
| MOVQ bb, ptrbb; | |||
| @@ -3621,7 +3622,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L441_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L441_bodyB: | |||
| movss 0*SIZE(ptrba), xvec0; | |||
| movss 0*SIZE(ptrbb), xvec1; | |||
| @@ -3646,7 +3647,7 @@ addq $4*SIZE, ptrba; | |||
| addq $4*SIZE, ptrbb; | |||
| decq k; | |||
| JG .L441_bodyB; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L441_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -3654,7 +3655,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L442_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L442_bodyB: | |||
| movss 0*SIZE(ptrba), xvec0; | |||
| movss 0*SIZE(ptrbb), xvec1; | |||
| @@ -3675,7 +3676,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L443_loopE; | |||
| .align 16 | |||
| ALIGN_4 | |||
| .L443_bodyB: | |||
| movss 0*SIZE(ptrba), xvec0; | |||
| movss 0*SIZE(ptrbb), xvec1; | |||
| @@ -145,6 +145,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define JMP jmp | |||
| #define NOP | |||
| #define XOR xorpd | |||
| #undef MOVQ | |||
| #define MOVQ movq | |||
| #define XOR_SY vxorps | |||
| #define XOR_DY vxorpd | |||
| @@ -297,7 +299,7 @@ movq %r11, kk; | |||
| MOVQ bn,j; | |||
| SARQ $2,j; # Rn = 4 | |||
| JLE .L0_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L0_bodyB:; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| MOVQ OFFSET, %rax; | |||
| @@ -312,7 +314,7 @@ MOVQ ba,ptrba; | |||
| MOVQ bm,i; | |||
| SARQ $2,i; # Rm = 4 | |||
| JLE .L1_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L1_bodyB:; | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -361,7 +363,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2,k; # Unroll 4 times | |||
| JLE .L2_loopE; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L2_bodyB:; | |||
| #### Computing kernel #### | |||
| @@ -584,7 +586,7 @@ ADD2_DY yvec6, yvec12, yvec12; | |||
| ADD2_DY yvec7, yvec8, yvec8; | |||
| DECQ k; | |||
| JG .L2_bodyB; | |||
| .align 64; | |||
| ALIGN_5 | |||
| .L2_loopE:; | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -592,7 +594,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L3_loopE; | |||
| .align 64 | |||
| ALIGN_5 | |||
| .L3_bodyB: | |||
| #### Unroll time 1 #### | |||
| LD_DY 4*SIZE(ptrba), yvec1; | |||
| @@ -710,7 +712,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L4_loopE; | |||
| .align 64 | |||
| ALIGN_5 | |||
| .L4_loopB:; | |||
| #### Unroll time 1 #### | |||
| PREFETCH0 PRESIZE*SIZE(ptrba); | |||
| @@ -852,7 +854,7 @@ MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L4_loopEx; | |||
| .align 32 | |||
| ALIGN_5 | |||
| #### Store Back #### | |||
| EXTRA_DY $1,yvec15,xvec7; | |||
| EXTRA_DY $1,yvec14,xvec6; | |||
| @@ -912,7 +914,7 @@ ADDQ $8*SIZE,C1; | |||
| DECQ i; | |||
| JG .L1_bodyB; | |||
| JMP .L1_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L4_loopEx: | |||
| EXTRA_DY $1, yvec15, xvec7; | |||
| EXTRA_DY $1, yvec14, xvec6; | |||
| @@ -1024,11 +1026,11 @@ ADDQ $8*SIZE, C0; | |||
| ADDQ $8*SIZE, C1; | |||
| DECQ i; | |||
| JG .L1_bodyB; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L1_loopE:; | |||
| TEST $2, bm; | |||
| JLE .L5_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L5_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -1060,7 +1062,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L7_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L7_bodyB: | |||
| #### Compute kernel #### | |||
| #### Unroll times 1 #### | |||
| @@ -1194,7 +1196,7 @@ ADD2_DY yvec7, yvec12, yvec12; | |||
| ADDQ $32*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L7_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L7_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -1202,7 +1204,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L8_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L8_bodyB: | |||
| #### Unroll times 1 #### | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| @@ -1276,7 +1278,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L9_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L9_bodyB: | |||
| #### Unroll times 1 #### | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| @@ -1364,7 +1366,7 @@ MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L9_loopEx; | |||
| .align 32 | |||
| ALIGN_5 | |||
| #### Writing back #### | |||
| EXTRA_DY $1, yvec15, xvec7; | |||
| EXTRA_DY $1, yvec14, xvec6; | |||
| @@ -1401,7 +1403,7 @@ ADDQ $2, kk; | |||
| ADDQ $4*SIZE, C0; | |||
| ADDQ $4*SIZE, C1; | |||
| JMP .L5_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L9_loopEx: | |||
| EXTRA_DY $1, yvec15, xvec7; | |||
| EXTRA_DY $1, yvec14, xvec6; | |||
| @@ -1466,7 +1468,7 @@ ADDQ $4*SIZE, C1; | |||
| .L5_loopE: | |||
| TEST $1, bm; | |||
| JLE .L6_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L6_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -1496,7 +1498,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L10_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L10_bodyB: | |||
| LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | |||
| EDUP_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -1570,7 +1572,7 @@ ADDQ $8*SIZE, ptrba; | |||
| ADDQ $32*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L10_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L10_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -1578,7 +1580,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L11_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L11_bodyB: | |||
| LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | |||
| EDUP_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -1624,7 +1626,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L12_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L12_bodyB: | |||
| LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | |||
| EDUP_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -1722,11 +1724,11 @@ LEAQ (C,ldc,4),C; | |||
| .L0_bodyE:; | |||
| DECQ j; | |||
| JG .L0_bodyB; | |||
| .align 32; | |||
| ALIGN_5; | |||
| .L0_loopE:; | |||
| TEST $2, bn; | |||
| JLE .L20_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L20_bodyB: | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| MOVQ OFFSET, %rax; | |||
| @@ -1738,7 +1740,7 @@ MOVQ ba, ptrba; | |||
| MOVQ bm, i; | |||
| SARQ $2, i; | |||
| JLE .L21_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L21_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -1770,7 +1772,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L211_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L211_bodyB: | |||
| #### Unroll time 1 #### | |||
| EDUP_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -1891,7 +1893,7 @@ ADD2_DY yvec7, yvec12, yvec12; | |||
| ADDQ $32*SIZE, ptrba; | |||
| DECQ k; | |||
| JG .L211_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L211_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -1899,7 +1901,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L212_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L212_bodyB: | |||
| #### Unroll time 1 #### | |||
| EDUP_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -1969,7 +1971,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L213_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L213_bodyB: | |||
| #### Unroll time 1 #### | |||
| EDUP_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -2058,7 +2060,7 @@ MOVQ C0, %rax; | |||
| OR ldc, %rax; | |||
| TEST $15, %rax; | |||
| JNE .L213_loopEx; | |||
| .align 32 | |||
| ALIGN_5 | |||
| #### Writing back #### | |||
| #ifndef TRMMKERNEL | |||
| ADD_DX 0*SIZE(C0),xvec15; | |||
| @@ -2093,7 +2095,7 @@ ADDQ $8*SIZE, C1; | |||
| DECQ i; | |||
| JG .L21_bodyB; | |||
| JMP .L21_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L213_loopEx: | |||
| #ifndef TRMMKERNEL | |||
| LDL_DX 0*SIZE(C0), xvec0; | |||
| @@ -2153,11 +2155,11 @@ ADDQ $8*SIZE, C0; | |||
| ADDQ $8*SIZE, C1; | |||
| DECQ i; | |||
| JG .L21_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L21_loopE: | |||
| TEST $2, bm; | |||
| JLE .L22_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L22_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -2187,7 +2189,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L221_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L221_bodyB: | |||
| #### Unroll time 1 #### | |||
| EDUP_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -2268,7 +2270,7 @@ ADD2_DY yvec6, yvec13, yvec13; | |||
| ADDQ $16*SIZE, ptrba; | |||
| DECQ k; | |||
| JG .L221_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L221_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -2276,7 +2278,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L222_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L222_bodyB: | |||
| #### Unroll time 1 #### | |||
| EDUP_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -2325,7 +2327,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L223_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L223_bodyB: | |||
| #### Unroll time 1 #### | |||
| EDUP_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -2419,7 +2421,7 @@ ADDQ $4*SIZE, C1; | |||
| .L22_loopE: | |||
| TEST $1, bm; | |||
| JLE .L23_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L23_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -2448,7 +2450,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L231_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L231_bodyB: | |||
| LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | |||
| EDUP_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -2498,7 +2500,7 @@ ADDQ $8*SIZE, ptrba; | |||
| ADDQ $16*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L231_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L231_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -2506,7 +2508,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L232_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L232_bodyB: | |||
| LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | |||
| EDUP_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -2540,7 +2542,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L233_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L233_bodyB: | |||
| LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | |||
| EDUP_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -2614,7 +2616,7 @@ LEAQ (C, ldc, 2), C; | |||
| .L20_loopE: | |||
| TEST $1, bn; | |||
| JLE .L30_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L30_bodyB: | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| MOVQ OFFSET, %rax; | |||
| @@ -2625,7 +2627,7 @@ MOVQ C, C0; | |||
| MOVQ bm, i; | |||
| SARQ $2, i; | |||
| JLE .L31_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L31_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -2655,7 +2657,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L311_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L311_bodyB: | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| BROAD_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -2732,7 +2734,7 @@ ADDQ $32*SIZE, ptrba; | |||
| ADDQ $8*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L311_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L311_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -2740,7 +2742,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L312_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L312_bodyB: | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| BROAD_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -2787,7 +2789,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L313_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L313_bodyB: | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| BROAD_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -2877,11 +2879,11 @@ ADDQ $4, kk; | |||
| ADDQ $8*SIZE, C0; | |||
| DECQ i; | |||
| JG .L31_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L31_loopE: | |||
| TEST $2, bm; | |||
| JLE .L32_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L32_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -2910,7 +2912,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L321_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L321_bodyB: | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| BROAD_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -2951,7 +2953,7 @@ ADDQ $16*SIZE, ptrba; | |||
| ADDQ $8*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L321_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L321_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -2959,7 +2961,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L322_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L322_bodyB: | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| BROAD_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -2988,7 +2990,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L323_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L323_bodyB: | |||
| LD_DY 0*SIZE(ptrba), yvec0; | |||
| BROAD_DY 0*SIZE(ptrbb), yvec2; | |||
| @@ -3049,7 +3051,7 @@ ADDQ $4*SIZE, C0; | |||
| .L32_loopE: | |||
| TEST $1, bm; | |||
| JLE .L33_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L33_bodyB: | |||
| #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | |||
| MOVQ bb,ptrbb; | |||
| @@ -3078,7 +3080,7 @@ MOVQ %rax, kkk; | |||
| #endif | |||
| SARQ $2, k; | |||
| JLE .L331_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L331_bodyB: | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| BROAD_DX 0*SIZE(ptrbb), xvec2; | |||
| @@ -3123,7 +3125,7 @@ ADDQ $8*SIZE, ptrba; | |||
| ADDQ $8*SIZE, ptrbb; | |||
| DECQ k; | |||
| JG .L331_bodyB; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L331_loopE: | |||
| #ifndef TRMMKERNEL | |||
| TEST $2, bk; | |||
| @@ -3131,7 +3133,7 @@ TEST $2, bk; | |||
| TEST $2, kkk; | |||
| #endif | |||
| JLE .L332_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L332_bodyB: | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| BROAD_DX 0*SIZE(ptrbb), xvec2; | |||
| @@ -3162,7 +3164,7 @@ TEST $1, bk; | |||
| TEST $1, kkk; | |||
| #endif | |||
| JLE .L333_loopE; | |||
| .align 32 | |||
| ALIGN_5 | |||
| .L333_bodyB: | |||
| LD_DX 0*SIZE(ptrba), xvec0; | |||
| BROAD_DX 0*SIZE(ptrbb), xvec2; | |||