| @@ -127,17 +127,18 @@ | |||||
| *******************************************************************************************/ | *******************************************************************************************/ | ||||
| #define KERNEL16x3_1(xx) \ | #define KERNEL16x3_1(xx) \ | ||||
| prefetcht0 A_PR1(AO,%rax,SIZE) ;\ | |||||
| vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ | vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ | ||||
| vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ | vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ | ||||
| vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ | vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ | ||||
| vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | ||||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ | vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ | ||||
| nop ;\ | |||||
| vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | ||||
| vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | ||||
| vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ | vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ | ||||
| prefetcht0 A_PR1(AO,%rax,SIZE) ;\ | |||||
| vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ | vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ | ||||
| vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ | vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ | ||||
| @@ -146,20 +147,21 @@ | |||||
| vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ | vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ | ||||
| vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ | vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ | ||||
| vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ | |||||
| vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ | |||||
| vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ | vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ | ||||
| #define KERNEL16x3_2(xx) \ | #define KERNEL16x3_2(xx) \ | ||||
| prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ | |||||
| vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ | |||||
| vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ | vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ | ||||
| vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ | |||||
| vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | ||||
| vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ | vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ | ||||
| nop ;\ | |||||
| vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | ||||
| vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | ||||
| vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ | vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ | ||||
| prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ | |||||
| vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ | vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ | ||||
| vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ | vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ | ||||
| @@ -168,20 +170,21 @@ | |||||
| vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ | vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ | ||||
| vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ | vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ | ||||
| vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ | |||||
| vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ | |||||
| vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ | vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ | ||||
| #define KERNEL16x3_3(xx) \ | #define KERNEL16x3_3(xx) \ | ||||
| prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ | |||||
| vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ | |||||
| vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ | vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ | ||||
| vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ | |||||
| vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | ||||
| vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ | vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ | ||||
| nop ;\ | |||||
| vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | ||||
| vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | ||||
| vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ | vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ | ||||
| prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ | |||||
| vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ | vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ | ||||
| vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ | vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ | ||||
| @@ -190,31 +193,32 @@ | |||||
| vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ | vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ | ||||
| vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ | vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ | ||||
| vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ | |||||
| vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ | |||||
| vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ | vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ | ||||
| #define KERNEL16x3_4(xx) \ | #define KERNEL16x3_4(xx) \ | ||||
| prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ | |||||
| vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ | |||||
| vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ | vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ | ||||
| vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ | |||||
| vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | ||||
| vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ | vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ | ||||
| nop ;\ | |||||
| vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | ||||
| vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | ||||
| vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ | vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ | ||||
| prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ | |||||
| vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ | vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ | ||||
| vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ | vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ | ||||
| vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ | vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ | ||||
| addq $12, BI ;\ | |||||
| vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ | vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ | ||||
| vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ | vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ | ||||
| vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ | vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ | ||||
| vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ | |||||
| addq $12, BI ;\ | |||||
| addq $64, %rax ;\ | addq $64, %rax ;\ | ||||
| vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ | |||||
| #define KERNEL16x3_SUB(xx) \ | #define KERNEL16x3_SUB(xx) \ | ||||
| vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ | vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ | ||||
| @@ -223,6 +227,7 @@ | |||||
| vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ | vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ | ||||
| vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | ||||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ | vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ | ||||
| nop ;\ | |||||
| vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | ||||
| vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | ||||
| @@ -248,6 +253,7 @@ | |||||
| vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ | vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ | ||||
| vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | ||||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ | vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ | ||||
| nop ;\ | |||||
| vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | ||||
| vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | ||||
| @@ -261,6 +267,7 @@ | |||||
| vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ | vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ | ||||
| vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | ||||
| vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ | vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ | ||||
| nop ;\ | |||||
| vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | ||||
| vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | ||||
| @@ -275,6 +282,7 @@ | |||||
| vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ | vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ | ||||
| vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | ||||
| vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ | vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ | ||||
| nop ;\ | |||||
| vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | ||||
| vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | ||||
| @@ -288,6 +296,7 @@ | |||||
| vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ | vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ | ||||
| vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | ||||
| vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ | vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ | ||||
| nop ;\ | |||||
| vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | ||||
| vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | ||||
| @@ -303,6 +312,7 @@ | |||||
| vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ | vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ | ||||
| vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ | ||||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ | vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ | ||||
| nop ;\ | |||||
| vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ | ||||
| vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ | ||||
| vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ | ||||
| @@ -1072,15 +1082,74 @@ | |||||
| leaq (B,%rax, SIZE), BO2 // next offset to BO2 | leaq (B,%rax, SIZE), BO2 // next offset to BO2 | ||||
| leaq BUFFER1, BO // first buffer to BO | leaq BUFFER1, BO // first buffer to BO | ||||
| movq K, %rax | movq K, %rax | ||||
| sarq $3 , %rax // K / 8 | |||||
| jz .L6_01a_2 | |||||
| ALIGN_4 | |||||
| .L6_01a_1: | |||||
| prefetcht0 512(BO1) | |||||
| prefetcht0 512(BO2) | |||||
| prefetchw 512(BO) | |||||
| vmovsd 0 * SIZE(BO1), %xmm0 | |||||
| vmovsd 2 * SIZE(BO1), %xmm2 | |||||
| vmovsd 4 * SIZE(BO1), %xmm4 | |||||
| vmovsd 6 * SIZE(BO1), %xmm6 | |||||
| vmovss 0 * SIZE(BO2), %xmm1 | |||||
| vmovss 2 * SIZE(BO2), %xmm3 | |||||
| vmovss 4 * SIZE(BO2), %xmm5 | |||||
| vmovss 6 * SIZE(BO2), %xmm7 | |||||
| vmovsd %xmm0, 0*SIZE(BO) | |||||
| vmovss %xmm1, 2*SIZE(BO) | |||||
| vmovsd %xmm2, 3*SIZE(BO) | |||||
| vmovss %xmm3, 5*SIZE(BO) | |||||
| vmovsd %xmm4, 6*SIZE(BO) | |||||
| vmovss %xmm5, 8*SIZE(BO) | |||||
| vmovsd %xmm6, 9*SIZE(BO) | |||||
| vmovss %xmm7,11*SIZE(BO) | |||||
| addq $8*SIZE,BO1 | |||||
| addq $8*SIZE,BO2 | |||||
| addq $12*SIZE,BO | |||||
| vmovsd 0 * SIZE(BO1), %xmm0 | |||||
| vmovsd 2 * SIZE(BO1), %xmm2 | |||||
| vmovsd 4 * SIZE(BO1), %xmm4 | |||||
| vmovsd 6 * SIZE(BO1), %xmm6 | |||||
| vmovss 0 * SIZE(BO2), %xmm1 | |||||
| vmovss 2 * SIZE(BO2), %xmm3 | |||||
| vmovss 4 * SIZE(BO2), %xmm5 | |||||
| vmovss 6 * SIZE(BO2), %xmm7 | |||||
| vmovsd %xmm0, 0*SIZE(BO) | |||||
| vmovss %xmm1, 2*SIZE(BO) | |||||
| vmovsd %xmm2, 3*SIZE(BO) | |||||
| vmovss %xmm3, 5*SIZE(BO) | |||||
| vmovsd %xmm4, 6*SIZE(BO) | |||||
| vmovss %xmm5, 8*SIZE(BO) | |||||
| vmovsd %xmm6, 9*SIZE(BO) | |||||
| vmovss %xmm7,11*SIZE(BO) | |||||
| addq $8*SIZE,BO1 | |||||
| addq $8*SIZE,BO2 | |||||
| addq $12*SIZE,BO | |||||
| decq %rax | |||||
| jnz .L6_01a_1 | |||||
| .L6_01a_2: | |||||
| movq K, %rax | |||||
| andq $7, %rax // K % 8 | |||||
| jz .L6_02c | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L6_02b: | .L6_02b: | ||||
| vmovss 0 * SIZE(BO1), %xmm0 | |||||
| vmovss 1 * SIZE(BO1), %xmm1 | |||||
| vmovsd 0 * SIZE(BO1), %xmm0 | |||||
| vmovss 0 * SIZE(BO2), %xmm2 | vmovss 0 * SIZE(BO2), %xmm2 | ||||
| vmovss %xmm0, 0*SIZE(BO) | |||||
| vmovss %xmm1, 1*SIZE(BO) | |||||
| vmovsd %xmm0, 0*SIZE(BO) | |||||
| vmovss %xmm2, 2*SIZE(BO) | vmovss %xmm2, 2*SIZE(BO) | ||||
| addq $2*SIZE,BO1 | addq $2*SIZE,BO1 | ||||
| addq $2*SIZE,BO2 | addq $2*SIZE,BO2 | ||||
| @@ -1096,17 +1165,73 @@ | |||||
| leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 | leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 | ||||
| leaq BUFFER2, BO // second buffer to BO | leaq BUFFER2, BO // second buffer to BO | ||||
| movq K, %rax | movq K, %rax | ||||
| sarq $3 , %rax // K / 8 | |||||
| jz .L6_02c_2 | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L6_02c_1: | |||||
| prefetcht0 512(BO2) | |||||
| prefetchw 512(BO) | |||||
| vmovsd 0 * SIZE(BO2), %xmm0 | |||||
| vmovsd 2 * SIZE(BO2), %xmm2 | |||||
| vmovsd 4 * SIZE(BO2), %xmm4 | |||||
| vmovsd 6 * SIZE(BO2), %xmm6 | |||||
| vmovss 1 * SIZE(BO1), %xmm1 | |||||
| vmovss 3 * SIZE(BO1), %xmm3 | |||||
| vmovss 5 * SIZE(BO1), %xmm5 | |||||
| vmovss 7 * SIZE(BO1), %xmm7 | |||||
| vmovss %xmm1, 0*SIZE(BO) | |||||
| vmovsd %xmm0, 1*SIZE(BO) | |||||
| vmovss %xmm3, 3*SIZE(BO) | |||||
| vmovsd %xmm2, 4*SIZE(BO) | |||||
| vmovss %xmm5, 6*SIZE(BO) | |||||
| vmovsd %xmm4, 7*SIZE(BO) | |||||
| vmovss %xmm7, 9*SIZE(BO) | |||||
| vmovsd %xmm6,10*SIZE(BO) | |||||
| addq $8*SIZE,BO1 | |||||
| addq $8*SIZE,BO2 | |||||
| addq $12*SIZE,BO | |||||
| vmovsd 0 * SIZE(BO2), %xmm0 | |||||
| vmovsd 2 * SIZE(BO2), %xmm2 | |||||
| vmovsd 4 * SIZE(BO2), %xmm4 | |||||
| vmovsd 6 * SIZE(BO2), %xmm6 | |||||
| vmovss 1 * SIZE(BO1), %xmm1 | |||||
| vmovss 3 * SIZE(BO1), %xmm3 | |||||
| vmovss 5 * SIZE(BO1), %xmm5 | |||||
| vmovss 7 * SIZE(BO1), %xmm7 | |||||
| vmovss %xmm1, 0*SIZE(BO) | |||||
| vmovsd %xmm0, 1*SIZE(BO) | |||||
| vmovss %xmm3, 3*SIZE(BO) | |||||
| vmovsd %xmm2, 4*SIZE(BO) | |||||
| vmovss %xmm5, 6*SIZE(BO) | |||||
| vmovsd %xmm4, 7*SIZE(BO) | |||||
| vmovss %xmm7, 9*SIZE(BO) | |||||
| vmovsd %xmm6,10*SIZE(BO) | |||||
| addq $8*SIZE,BO1 | |||||
| addq $8*SIZE,BO2 | |||||
| addq $12*SIZE,BO | |||||
| decq %rax | |||||
| jnz .L6_02c_1 | |||||
| .L6_02c_2: | |||||
| movq K, %rax | |||||
| andq $7, %rax // K % 8 | |||||
| jz .L6_03c | |||||
| ALIGN_4 | |||||
| .L6_03b: | .L6_03b: | ||||
| vmovss 1*SIZE(BO1), %xmm0 | vmovss 1*SIZE(BO1), %xmm0 | ||||
| vmovss 0*SIZE(BO2), %xmm1 | |||||
| vmovss 1*SIZE(BO2), %xmm2 | |||||
| vmovsd 0*SIZE(BO2), %xmm1 | |||||
| vmovss %xmm0, 0*SIZE(BO) | vmovss %xmm0, 0*SIZE(BO) | ||||
| vmovss %xmm1, 1*SIZE(BO) | |||||
| vmovss %xmm2, 2*SIZE(BO) | |||||
| vmovsd %xmm1, 1*SIZE(BO) | |||||
| addq $2*SIZE,BO1 | addq $2*SIZE,BO1 | ||||
| addq $2*SIZE,BO2 | addq $2*SIZE,BO2 | ||||
| addq $3*SIZE,BO | addq $3*SIZE,BO | ||||