| @@ -192,8 +192,8 @@ | |||||
| VFMADD231PD_ %ymm13,%ymm1,%ymm0 | VFMADD231PD_ %ymm13,%ymm1,%ymm0 | ||||
| VFMADD231PD_ %ymm14,%ymm2,%ymm0 | VFMADD231PD_ %ymm14,%ymm2,%ymm0 | ||||
| VFMADD231PD_ %ymm15,%ymm3,%ymm0 | VFMADD231PD_ %ymm15,%ymm3,%ymm0 | ||||
| addq $3*SIZE , BO | |||||
| addq $16*SIZE, AO | |||||
| addq $ 3*SIZE , BO | |||||
| addq $ 16*SIZE, AO | |||||
| .endm | .endm | ||||
| @@ -212,8 +212,8 @@ | |||||
| VFMADD231PD_ %ymm8,%ymm2,%ymm0 | VFMADD231PD_ %ymm8,%ymm2,%ymm0 | ||||
| VFMADD231PD_ %ymm9,%ymm3,%ymm0 | VFMADD231PD_ %ymm9,%ymm3,%ymm0 | ||||
| prefetcht0 B_PR1(BO) | prefetcht0 B_PR1(BO) | ||||
| addq $3*SIZE , BO | |||||
| addq $8*SIZE, AO | |||||
| addq $ 3*SIZE , BO | |||||
| addq $ 8*SIZE, AO | |||||
| .endm | .endm | ||||
| .macro KERNEL4x3_SUBN | .macro KERNEL4x3_SUBN | ||||
| @@ -224,8 +224,8 @@ | |||||
| VFMADD231PD_ %ymm5,%ymm2,%ymm0 | VFMADD231PD_ %ymm5,%ymm2,%ymm0 | ||||
| vbroadcastsd -10 * SIZE(BO), %ymm3 | vbroadcastsd -10 * SIZE(BO), %ymm3 | ||||
| VFMADD231PD_ %ymm6,%ymm3,%ymm0 | VFMADD231PD_ %ymm6,%ymm3,%ymm0 | ||||
| addq $3*SIZE , BO | |||||
| addq $4*SIZE, AO | |||||
| addq $ 3*SIZE , BO | |||||
| addq $ 4*SIZE, AO | |||||
| .endm | .endm | ||||
| .macro KERNEL2x3_SUBN | .macro KERNEL2x3_SUBN | ||||
| @@ -240,8 +240,8 @@ | |||||
| VFMADD231SD_ %xmm8,%xmm1,%xmm0 | VFMADD231SD_ %xmm8,%xmm1,%xmm0 | ||||
| VFMADD231SD_ %xmm10,%xmm2,%xmm0 | VFMADD231SD_ %xmm10,%xmm2,%xmm0 | ||||
| VFMADD231SD_ %xmm12,%xmm3,%xmm0 | VFMADD231SD_ %xmm12,%xmm3,%xmm0 | ||||
| addq $3*SIZE , BO | |||||
| addq $2*SIZE, AO | |||||
| addq $ 3*SIZE , BO | |||||
| addq $ 2*SIZE, AO | |||||
| .endm | .endm | ||||
| .macro KERNEL1x3_SUBN | .macro KERNEL1x3_SUBN | ||||
| @@ -252,8 +252,8 @@ | |||||
| VFMADD231SD_ %xmm5,%xmm2,%xmm0 | VFMADD231SD_ %xmm5,%xmm2,%xmm0 | ||||
| vmovsd -10 * SIZE(BO), %xmm3 | vmovsd -10 * SIZE(BO), %xmm3 | ||||
| VFMADD231SD_ %xmm6,%xmm3,%xmm0 | VFMADD231SD_ %xmm6,%xmm3,%xmm0 | ||||
| addq $3*SIZE , BO | |||||
| addq $1*SIZE, AO | |||||
| addq $ 3*SIZE , BO | |||||
| addq $ 1*SIZE, AO | |||||
| .endm | .endm | ||||
| @@ -1602,16 +1602,16 @@ | |||||
| vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 | vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 | ||||
| vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 | vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 | ||||
| VFMADD231SD_ %xmm4,%xmm1,%xmm0 | VFMADD231SD_ %xmm4,%xmm1,%xmm0 | ||||
| addq $4, BI | |||||
| addq $4, %rax | |||||
| addq $ 4, BI | |||||
| addq $ 4, %rax | |||||
| .endm | .endm | ||||
| .macro KERNEL1x1_SUB | .macro KERNEL1x1_SUB | ||||
| vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 | vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 | ||||
| vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 | vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 | ||||
| VFMADD231SD_ %xmm4,%xmm1,%xmm0 | VFMADD231SD_ %xmm4,%xmm1,%xmm0 | ||||
| addq $1, BI | |||||
| addq $1 , %rax | |||||
| addq $ 1, BI | |||||
| addq $ 1 , %rax | |||||
| .endm | .endm | ||||
| .macro SAVE1x1 | .macro SAVE1x1 | ||||
| @@ -1749,9 +1749,9 @@ | |||||
| vmovsd %xmm5, 8*SIZE(BO) | vmovsd %xmm5, 8*SIZE(BO) | ||||
| vmovups %xmm6, 9*SIZE(BO) | vmovups %xmm6, 9*SIZE(BO) | ||||
| vmovsd %xmm7,11*SIZE(BO) | vmovsd %xmm7,11*SIZE(BO) | ||||
| addq $8*SIZE,BO1 | |||||
| addq $8*SIZE,BO2 | |||||
| addq $12*SIZE,BO | |||||
| addq $ 8*SIZE,BO1 | |||||
| addq $ 8*SIZE,BO2 | |||||
| addq $ 12*SIZE,BO | |||||
| vmovups 0 * SIZE(BO1), %xmm0 | vmovups 0 * SIZE(BO1), %xmm0 | ||||
| vmovups 2 * SIZE(BO1), %xmm2 | vmovups 2 * SIZE(BO1), %xmm2 | ||||
| @@ -1769,9 +1769,9 @@ | |||||
| vmovsd %xmm5, 8*SIZE(BO) | vmovsd %xmm5, 8*SIZE(BO) | ||||
| vmovups %xmm6, 9*SIZE(BO) | vmovups %xmm6, 9*SIZE(BO) | ||||
| vmovsd %xmm7,11*SIZE(BO) | vmovsd %xmm7,11*SIZE(BO) | ||||
| addq $8*SIZE,BO1 | |||||
| addq $8*SIZE,BO2 | |||||
| addq $12*SIZE,BO | |||||
| addq $ 8*SIZE,BO1 | |||||
| addq $ 8*SIZE,BO2 | |||||
| addq $ 12*SIZE,BO | |||||
| decq %rax | decq %rax | ||||
| jnz .L6_01a_1 | jnz .L6_01a_1 | ||||
| @@ -1792,9 +1792,9 @@ | |||||
| vmovsd 0 * SIZE(BO2), %xmm2 | vmovsd 0 * SIZE(BO2), %xmm2 | ||||
| vmovups %xmm0, 0*SIZE(BO) | vmovups %xmm0, 0*SIZE(BO) | ||||
| vmovsd %xmm2, 2*SIZE(BO) | vmovsd %xmm2, 2*SIZE(BO) | ||||
| addq $2*SIZE,BO1 | |||||
| addq $2*SIZE,BO2 | |||||
| addq $3*SIZE,BO | |||||
| addq $ 2*SIZE,BO1 | |||||
| addq $ 2*SIZE,BO2 | |||||
| addq $ 3*SIZE,BO | |||||
| decq %rax | decq %rax | ||||
| jnz .L6_02b | jnz .L6_02b | ||||
| @@ -107,22 +107,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(OS_WINDOWS) | #if defined(OS_WINDOWS) | ||||
| #if L_BUFFER_SIZE > 16384 | #if L_BUFFER_SIZE > 16384 | ||||
| #define STACK_TOUCH \ | #define STACK_TOUCH \ | ||||
| movl $0, 4096 * 4(%rsp);\ | |||||
| movl $0, 4096 * 3(%rsp);\ | |||||
| movl $0, 4096 * 2(%rsp);\ | |||||
| movl $0, 4096 * 1(%rsp); | |||||
| movl $ 0, 4096 * 4(%rsp);\ | |||||
| movl $ 0, 4096 * 3(%rsp);\ | |||||
| movl $ 0, 4096 * 2(%rsp);\ | |||||
| movl $ 0, 4096 * 1(%rsp); | |||||
| #elif L_BUFFER_SIZE > 12288 | #elif L_BUFFER_SIZE > 12288 | ||||
| #define STACK_TOUCH \ | #define STACK_TOUCH \ | ||||
| movl $0, 4096 * 3(%rsp);\ | |||||
| movl $0, 4096 * 2(%rsp);\ | |||||
| movl $0, 4096 * 1(%rsp); | |||||
| movl $ 0, 4096 * 3(%rsp);\ | |||||
| movl $ 0, 4096 * 2(%rsp);\ | |||||
| movl $ 0, 4096 * 1(%rsp); | |||||
| #elif L_BUFFER_SIZE > 8192 | #elif L_BUFFER_SIZE > 8192 | ||||
| #define STACK_TOUCH \ | #define STACK_TOUCH \ | ||||
| movl $0, 4096 * 2(%rsp);\ | |||||
| movl $0, 4096 * 1(%rsp); | |||||
| movl $ 0, 4096 * 2(%rsp);\ | |||||
| movl $ 0, 4096 * 1(%rsp); | |||||
| #elif L_BUFFER_SIZE > 4096 | #elif L_BUFFER_SIZE > 4096 | ||||
| #define STACK_TOUCH \ | #define STACK_TOUCH \ | ||||
| movl $0, 4096 * 1(%rsp); | |||||
| movl $ 0, 4096 * 1(%rsp); | |||||
| #else | #else | ||||
| #define STACK_TOUCH | #define STACK_TOUCH | ||||
| #endif | #endif | ||||
| @@ -168,17 +168,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmulpd %ymm0 ,%ymm2 , %ymm8 | vmulpd %ymm0 ,%ymm2 , %ymm8 | ||||
| vmulpd %ymm0 ,%ymm3 , %ymm12 | vmulpd %ymm0 ,%ymm3 , %ymm12 | ||||
| prefetcht0 B_PR1+256(BO) | prefetcht0 B_PR1+256(BO) | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vmulpd %ymm0 ,%ymm1 , %ymm5 | vmulpd %ymm0 ,%ymm1 , %ymm5 | ||||
| vmulpd %ymm0 ,%ymm2 , %ymm9 | vmulpd %ymm0 ,%ymm2 , %ymm9 | ||||
| vmulpd %ymm0 ,%ymm3 , %ymm13 | vmulpd %ymm0 ,%ymm3 , %ymm13 | ||||
| vpermpd $0x1b, %ymm0 , %ymm0 | |||||
| vpermpd $ 0x1b, %ymm0 , %ymm0 | |||||
| vmulpd %ymm0 ,%ymm1 , %ymm6 | vmulpd %ymm0 ,%ymm1 , %ymm6 | ||||
| vmulpd %ymm0 ,%ymm2 , %ymm10 | vmulpd %ymm0 ,%ymm2 , %ymm10 | ||||
| addq $12*SIZE, BO | |||||
| addq $ 12*SIZE, BO | |||||
| vmulpd %ymm0 ,%ymm3 , %ymm14 | vmulpd %ymm0 ,%ymm3 , %ymm14 | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vmulpd %ymm0 ,%ymm1 , %ymm7 | vmulpd %ymm0 ,%ymm1 , %ymm7 | ||||
| vmovups -12 * SIZE(BO), %ymm1 | vmovups -12 * SIZE(BO), %ymm1 | ||||
| vmulpd %ymm0 ,%ymm2 , %ymm11 | vmulpd %ymm0 ,%ymm2 , %ymm11 | ||||
| @@ -197,16 +197,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm8 | vfmadd231pd %ymm0 ,%ymm2 , %ymm8 | ||||
| prefetcht0 B_PR1+128(BO) | prefetcht0 B_PR1+128(BO) | ||||
| vfmadd231pd %ymm0 ,%ymm3 , %ymm12 | vfmadd231pd %ymm0 ,%ymm3 , %ymm12 | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | ||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm9 | vfmadd231pd %ymm0 ,%ymm2 , %ymm9 | ||||
| vfmadd231pd %ymm0 ,%ymm3 , %ymm13 | vfmadd231pd %ymm0 ,%ymm3 , %ymm13 | ||||
| vpermpd $0x1b, %ymm0 , %ymm0 | |||||
| vpermpd $ 0x1b, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | ||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm10 | vfmadd231pd %ymm0 ,%ymm2 , %ymm10 | ||||
| vfmadd231pd %ymm0 ,%ymm3 , %ymm14 | vfmadd231pd %ymm0 ,%ymm3 , %ymm14 | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | ||||
| vmovups -12 * SIZE(BO), %ymm1 | vmovups -12 * SIZE(BO), %ymm1 | ||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm11 | vfmadd231pd %ymm0 ,%ymm2 , %ymm11 | ||||
| @@ -221,24 +221,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm4 | vfmadd231pd %ymm0 ,%ymm1 , %ymm4 | ||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm8 | vfmadd231pd %ymm0 ,%ymm2 , %ymm8 | ||||
| vfmadd231pd %ymm0 ,%ymm3 , %ymm12 | vfmadd231pd %ymm0 ,%ymm3 , %ymm12 | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | ||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm9 | vfmadd231pd %ymm0 ,%ymm2 , %ymm9 | ||||
| vfmadd231pd %ymm0 ,%ymm3 , %ymm13 | vfmadd231pd %ymm0 ,%ymm3 , %ymm13 | ||||
| vpermpd $0x1b, %ymm0 , %ymm0 | |||||
| vpermpd $ 0x1b, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | ||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm10 | vfmadd231pd %ymm0 ,%ymm2 , %ymm10 | ||||
| addq $8*SIZE, AO | |||||
| addq $ 8*SIZE, AO | |||||
| vfmadd231pd %ymm0 ,%ymm3 , %ymm14 | vfmadd231pd %ymm0 ,%ymm3 , %ymm14 | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | ||||
| vmovups 0 * SIZE(BO), %ymm1 | vmovups 0 * SIZE(BO), %ymm1 | ||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm11 | vfmadd231pd %ymm0 ,%ymm2 , %ymm11 | ||||
| vmovups 4 * SIZE(BO), %ymm2 | vmovups 4 * SIZE(BO), %ymm2 | ||||
| vfmadd231pd %ymm0 ,%ymm3 , %ymm15 | vfmadd231pd %ymm0 ,%ymm3 , %ymm15 | ||||
| vmovups 8 * SIZE(BO), %ymm3 | vmovups 8 * SIZE(BO), %ymm3 | ||||
| addq $24*SIZE, BO | |||||
| addq $ 24*SIZE, BO | |||||
| .endm | .endm | ||||
| @@ -247,21 +247,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm4 | vfmadd231pd %ymm0 ,%ymm1 , %ymm4 | ||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm8 | vfmadd231pd %ymm0 ,%ymm2 , %ymm8 | ||||
| vfmadd231pd %ymm0 ,%ymm3 , %ymm12 | vfmadd231pd %ymm0 ,%ymm3 , %ymm12 | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | ||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm9 | vfmadd231pd %ymm0 ,%ymm2 , %ymm9 | ||||
| vfmadd231pd %ymm0 ,%ymm3 , %ymm13 | vfmadd231pd %ymm0 ,%ymm3 , %ymm13 | ||||
| vpermpd $0x1b, %ymm0 , %ymm0 | |||||
| vpermpd $ 0x1b, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | ||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm10 | vfmadd231pd %ymm0 ,%ymm2 , %ymm10 | ||||
| addq $8*SIZE, AO | |||||
| addq $ 8*SIZE, AO | |||||
| vfmadd231pd %ymm0 ,%ymm3 , %ymm14 | vfmadd231pd %ymm0 ,%ymm3 , %ymm14 | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | ||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm11 | vfmadd231pd %ymm0 ,%ymm2 , %ymm11 | ||||
| vfmadd231pd %ymm0 ,%ymm3 , %ymm15 | vfmadd231pd %ymm0 ,%ymm3 , %ymm15 | ||||
| addq $12*SIZE, BO | |||||
| addq $ 12*SIZE, BO | |||||
| .endm | .endm | ||||
| .macro KERNEL4x12_SUB | .macro KERNEL4x12_SUB | ||||
| @@ -272,17 +272,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm8 | vfmadd231pd %ymm0 ,%ymm2 , %ymm8 | ||||
| vmovups -4 * SIZE(BO), %ymm3 | vmovups -4 * SIZE(BO), %ymm3 | ||||
| vfmadd231pd %ymm0 ,%ymm3 , %ymm12 | vfmadd231pd %ymm0 ,%ymm3 , %ymm12 | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | ||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm9 | vfmadd231pd %ymm0 ,%ymm2 , %ymm9 | ||||
| addq $12*SIZE, BO | |||||
| addq $ 12*SIZE, BO | |||||
| vfmadd231pd %ymm0 ,%ymm3 , %ymm13 | vfmadd231pd %ymm0 ,%ymm3 , %ymm13 | ||||
| vpermpd $0x1b, %ymm0 , %ymm0 | |||||
| vpermpd $ 0x1b, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | ||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm10 | vfmadd231pd %ymm0 ,%ymm2 , %ymm10 | ||||
| addq $4*SIZE, AO | |||||
| addq $ 4*SIZE, AO | |||||
| vfmadd231pd %ymm0 ,%ymm3 , %ymm14 | vfmadd231pd %ymm0 ,%ymm3 , %ymm14 | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | ||||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm11 | vfmadd231pd %ymm0 ,%ymm2 , %ymm11 | ||||
| vfmadd231pd %ymm0 ,%ymm3 , %ymm15 | vfmadd231pd %ymm0 ,%ymm3 , %ymm15 | ||||
| @@ -309,23 +309,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmulpd %ymm0 , %ymm14, %ymm14 | vmulpd %ymm0 , %ymm14, %ymm14 | ||||
| vmulpd %ymm0 , %ymm15, %ymm15 | vmulpd %ymm0 , %ymm15, %ymm15 | ||||
| vpermpd $0xb1 , %ymm5, %ymm5 | |||||
| vpermpd $0xb1 , %ymm7, %ymm7 | |||||
| vpermpd $ 0xb1 , %ymm5, %ymm5 | |||||
| vpermpd $ 0xb1 , %ymm7, %ymm7 | |||||
| vblendpd $0x0a, %ymm5, %ymm4, %ymm0 | |||||
| vblendpd $0x05, %ymm5, %ymm4, %ymm1 | |||||
| vblendpd $0x0a, %ymm7, %ymm6, %ymm2 | |||||
| vblendpd $0x05, %ymm7, %ymm6, %ymm3 | |||||
| vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 | |||||
| vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 | |||||
| vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 | |||||
| vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 | |||||
| vpermpd $0x1b , %ymm2, %ymm2 | |||||
| vpermpd $0x1b , %ymm3, %ymm3 | |||||
| vpermpd $0xb1 , %ymm2, %ymm2 | |||||
| vpermpd $0xb1 , %ymm3, %ymm3 | |||||
| vpermpd $ 0x1b , %ymm2, %ymm2 | |||||
| vpermpd $ 0x1b , %ymm3, %ymm3 | |||||
| vpermpd $ 0xb1 , %ymm2, %ymm2 | |||||
| vpermpd $ 0xb1 , %ymm3, %ymm3 | |||||
| vblendpd $0x03, %ymm0, %ymm2 , %ymm4 | |||||
| vblendpd $0x03, %ymm1, %ymm3 , %ymm5 | |||||
| vblendpd $0x03, %ymm2, %ymm0 , %ymm6 | |||||
| vblendpd $0x03, %ymm3, %ymm1 , %ymm7 | |||||
| vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 | |||||
| vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 | |||||
| vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 | |||||
| vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 | |||||
| leaq (CO1, LDC, 2), %rax | leaq (CO1, LDC, 2), %rax | ||||
| @@ -349,23 +349,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| prefetcht0 32(%rax) | prefetcht0 32(%rax) | ||||
| prefetcht0 32(%rax,LDC) | prefetcht0 32(%rax,LDC) | ||||
| vpermpd $0xb1 , %ymm9 , %ymm9 | |||||
| vpermpd $0xb1 , %ymm11, %ymm11 | |||||
| vpermpd $ 0xb1 , %ymm9 , %ymm9 | |||||
| vpermpd $ 0xb1 , %ymm11, %ymm11 | |||||
| vblendpd $0x0a, %ymm9 , %ymm8 , %ymm0 | |||||
| vblendpd $0x05, %ymm9 , %ymm8 , %ymm1 | |||||
| vblendpd $0x0a, %ymm11, %ymm10, %ymm2 | |||||
| vblendpd $0x05, %ymm11, %ymm10, %ymm3 | |||||
| vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 | |||||
| vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 | |||||
| vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 | |||||
| vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 | |||||
| vpermpd $0x1b , %ymm2, %ymm2 | |||||
| vpermpd $0x1b , %ymm3, %ymm3 | |||||
| vpermpd $0xb1 , %ymm2, %ymm2 | |||||
| vpermpd $0xb1 , %ymm3, %ymm3 | |||||
| vpermpd $ 0x1b , %ymm2, %ymm2 | |||||
| vpermpd $ 0x1b , %ymm3, %ymm3 | |||||
| vpermpd $ 0xb1 , %ymm2, %ymm2 | |||||
| vpermpd $ 0xb1 , %ymm3, %ymm3 | |||||
| vblendpd $0x03, %ymm0, %ymm2 , %ymm4 | |||||
| vblendpd $0x03, %ymm1, %ymm3 , %ymm5 | |||||
| vblendpd $0x03, %ymm2, %ymm0 , %ymm6 | |||||
| vblendpd $0x03, %ymm3, %ymm1 , %ymm7 | |||||
| vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 | |||||
| vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 | |||||
| vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 | |||||
| vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 | |||||
| leaq (%rax, LDC, 2), %rax | leaq (%rax, LDC, 2), %rax | ||||
| @@ -390,23 +390,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| prefetcht0 32(%rbp) | prefetcht0 32(%rbp) | ||||
| prefetcht0 32(%rbp,LDC) | prefetcht0 32(%rbp,LDC) | ||||
| vpermpd $0xb1 , %ymm13, %ymm13 | |||||
| vpermpd $0xb1 , %ymm15, %ymm15 | |||||
| vpermpd $ 0xb1 , %ymm13, %ymm13 | |||||
| vpermpd $ 0xb1 , %ymm15, %ymm15 | |||||
| vblendpd $0x0a, %ymm13, %ymm12, %ymm0 | |||||
| vblendpd $0x05, %ymm13, %ymm12, %ymm1 | |||||
| vblendpd $0x0a, %ymm15, %ymm14, %ymm2 | |||||
| vblendpd $0x05, %ymm15, %ymm14, %ymm3 | |||||
| vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 | |||||
| vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 | |||||
| vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 | |||||
| vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 | |||||
| vpermpd $0x1b , %ymm2, %ymm2 | |||||
| vpermpd $0x1b , %ymm3, %ymm3 | |||||
| vpermpd $0xb1 , %ymm2, %ymm2 | |||||
| vpermpd $0xb1 , %ymm3, %ymm3 | |||||
| vpermpd $ 0x1b , %ymm2, %ymm2 | |||||
| vpermpd $ 0x1b , %ymm3, %ymm3 | |||||
| vpermpd $ 0xb1 , %ymm2, %ymm2 | |||||
| vpermpd $ 0xb1 , %ymm3, %ymm3 | |||||
| vblendpd $0x03, %ymm0, %ymm2 , %ymm4 | |||||
| vblendpd $0x03, %ymm1, %ymm3 , %ymm5 | |||||
| vblendpd $0x03, %ymm2, %ymm0 , %ymm6 | |||||
| vblendpd $0x03, %ymm3, %ymm1 , %ymm7 | |||||
| vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 | |||||
| vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 | |||||
| vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 | |||||
| vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 | |||||
| leaq (%rax, LDC, 4), %rax | leaq (%rax, LDC, 4), %rax | ||||
| @@ -431,7 +431,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| prefetcht0 32(%rbp) | prefetcht0 32(%rbp) | ||||
| prefetcht0 32(%rbp,LDC) | prefetcht0 32(%rbp,LDC) | ||||
| addq $4*SIZE, CO1 | |||||
| addq $ 4*SIZE, CO1 | |||||
| .endm | .endm | ||||
| /******************************************************************************************/ | /******************************************************************************************/ | ||||
| @@ -477,9 +477,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vfmadd231pd %xmm0 ,%xmm3 , %xmm12 | vfmadd231pd %xmm0 ,%xmm3 , %xmm12 | ||||
| vmovddup -1 * SIZE(BO), %xmm3 | vmovddup -1 * SIZE(BO), %xmm3 | ||||
| vfmadd231pd %xmm0 ,%xmm1 , %xmm13 | vfmadd231pd %xmm0 ,%xmm1 , %xmm13 | ||||
| addq $12*SIZE, BO | |||||
| addq $ 12*SIZE, BO | |||||
| vfmadd231pd %xmm0 ,%xmm2 , %xmm14 | vfmadd231pd %xmm0 ,%xmm2 , %xmm14 | ||||
| addq $2*SIZE, AO | |||||
| addq $ 2*SIZE, AO | |||||
| vfmadd231pd %xmm0 ,%xmm3 , %xmm15 | vfmadd231pd %xmm0 ,%xmm3 , %xmm15 | ||||
| .endm | .endm | ||||
| @@ -557,7 +557,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups %xmm6 , (%rbp) | vmovups %xmm6 , (%rbp) | ||||
| vmovups %xmm7 , (%rbp, LDC) | vmovups %xmm7 , (%rbp, LDC) | ||||
| addq $2*SIZE, CO1 | |||||
| addq $ 2*SIZE, CO1 | |||||
| .endm | .endm | ||||
| @@ -604,9 +604,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vfmadd231sd %xmm0 ,%xmm3 , %xmm12 | vfmadd231sd %xmm0 ,%xmm3 , %xmm12 | ||||
| vmovsd -1 * SIZE(BO), %xmm3 | vmovsd -1 * SIZE(BO), %xmm3 | ||||
| vfmadd231sd %xmm0 ,%xmm1 , %xmm13 | vfmadd231sd %xmm0 ,%xmm1 , %xmm13 | ||||
| addq $12*SIZE, BO | |||||
| addq $ 12*SIZE, BO | |||||
| vfmadd231sd %xmm0 ,%xmm2 , %xmm14 | vfmadd231sd %xmm0 ,%xmm2 , %xmm14 | ||||
| addq $1*SIZE, AO | |||||
| addq $ 1*SIZE, AO | |||||
| vfmadd231sd %xmm0 ,%xmm3 , %xmm15 | vfmadd231sd %xmm0 ,%xmm3 , %xmm15 | ||||
| .endm | .endm | ||||
| @@ -684,7 +684,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovsd %xmm6 , (%rbp) | vmovsd %xmm6 , (%rbp) | ||||
| vmovsd %xmm7 , (%rbp, LDC) | vmovsd %xmm7 , (%rbp, LDC) | ||||
| addq $1*SIZE, CO1 | |||||
| addq $ 1*SIZE, CO1 | |||||
| .endm | .endm | ||||
| @@ -707,13 +707,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups -12 * SIZE(BO), %ymm1 | vmovups -12 * SIZE(BO), %ymm1 | ||||
| vmovups -16 * SIZE(AO), %ymm0 | vmovups -16 * SIZE(AO), %ymm0 | ||||
| vmulpd %ymm0 ,%ymm1 , %ymm4 | vmulpd %ymm0 ,%ymm1 , %ymm4 | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vmulpd %ymm0 ,%ymm1 , %ymm5 | vmulpd %ymm0 ,%ymm1 , %ymm5 | ||||
| vpermpd $0x1b, %ymm0 , %ymm0 | |||||
| vpermpd $ 0x1b, %ymm0 , %ymm0 | |||||
| vmulpd %ymm0 ,%ymm1 , %ymm6 | vmulpd %ymm0 ,%ymm1 , %ymm6 | ||||
| addq $4*SIZE, BO | |||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| addq $ 4*SIZE, BO | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vmulpd %ymm0 ,%ymm1 , %ymm7 | vmulpd %ymm0 ,%ymm1 , %ymm7 | ||||
| vmovups -12 * SIZE(BO), %ymm1 | vmovups -12 * SIZE(BO), %ymm1 | ||||
| @@ -723,12 +723,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| prefetcht0 A_PR1(AO) | prefetcht0 A_PR1(AO) | ||||
| vmovups -16 * SIZE(AO), %ymm0 | vmovups -16 * SIZE(AO), %ymm0 | ||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm4 | vfmadd231pd %ymm0 ,%ymm1 , %ymm4 | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | ||||
| vpermpd $0x1b, %ymm0 , %ymm0 | |||||
| vpermpd $ 0x1b, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | ||||
| vmovups -12 * SIZE(BO), %ymm1 | vmovups -12 * SIZE(BO), %ymm1 | ||||
| @@ -737,44 +737,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro KERNEL4x4_M2 | .macro KERNEL4x4_M2 | ||||
| vmovups -12 * SIZE(AO), %ymm0 | vmovups -12 * SIZE(AO), %ymm0 | ||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm4 | vfmadd231pd %ymm0 ,%ymm1 , %ymm4 | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | ||||
| vpermpd $0x1b, %ymm0 , %ymm0 | |||||
| vpermpd $ 0x1b, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | ||||
| addq $8*SIZE, AO | |||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| addq $ 8*SIZE, AO | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | ||||
| vmovups -8 * SIZE(BO), %ymm1 | vmovups -8 * SIZE(BO), %ymm1 | ||||
| addq $8*SIZE, BO | |||||
| addq $ 8*SIZE, BO | |||||
| .endm | .endm | ||||
| .macro KERNEL4x4_E | .macro KERNEL4x4_E | ||||
| vmovups -12 * SIZE(AO), %ymm0 | vmovups -12 * SIZE(AO), %ymm0 | ||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm4 | vfmadd231pd %ymm0 ,%ymm1 , %ymm4 | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | ||||
| vpermpd $0x1b, %ymm0 , %ymm0 | |||||
| vpermpd $ 0x1b, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | ||||
| addq $8*SIZE, AO | |||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| addq $ 8*SIZE, AO | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | ||||
| addq $4*SIZE, BO | |||||
| addq $ 4*SIZE, BO | |||||
| .endm | .endm | ||||
| .macro KERNEL4x4_SUB | .macro KERNEL4x4_SUB | ||||
| vmovups -12 * SIZE(BO), %ymm1 | vmovups -12 * SIZE(BO), %ymm1 | ||||
| vmovups -16 * SIZE(AO), %ymm0 | vmovups -16 * SIZE(AO), %ymm0 | ||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm4 | vfmadd231pd %ymm0 ,%ymm1 , %ymm4 | ||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | vfmadd231pd %ymm0 ,%ymm1 , %ymm5 | ||||
| addq $4*SIZE, BO | |||||
| vpermpd $0x1b, %ymm0 , %ymm0 | |||||
| addq $ 4*SIZE, BO | |||||
| vpermpd $ 0x1b, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | vfmadd231pd %ymm0 ,%ymm1 , %ymm6 | ||||
| addq $4*SIZE, AO | |||||
| vpermpd $0xb1, %ymm0 , %ymm0 | |||||
| addq $ 4*SIZE, AO | |||||
| vpermpd $ 0xb1, %ymm0 , %ymm0 | |||||
| vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | vfmadd231pd %ymm0 ,%ymm1 , %ymm7 | ||||
| .endm | .endm | ||||
| @@ -788,23 +788,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmulpd %ymm0 , %ymm5 , %ymm5 | vmulpd %ymm0 , %ymm5 , %ymm5 | ||||
| vmulpd %ymm0 , %ymm6 , %ymm6 | vmulpd %ymm0 , %ymm6 , %ymm6 | ||||
| vpermpd $0xb1 , %ymm5, %ymm5 | |||||
| vpermpd $0xb1 , %ymm7, %ymm7 | |||||
| vpermpd $ 0xb1 , %ymm5, %ymm5 | |||||
| vpermpd $ 0xb1 , %ymm7, %ymm7 | |||||
| vblendpd $0x0a, %ymm5, %ymm4, %ymm0 | |||||
| vblendpd $0x05, %ymm5, %ymm4, %ymm1 | |||||
| vblendpd $0x0a, %ymm7, %ymm6, %ymm2 | |||||
| vblendpd $0x05, %ymm7, %ymm6, %ymm3 | |||||
| vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 | |||||
| vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 | |||||
| vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 | |||||
| vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 | |||||
| vpermpd $0x1b , %ymm2, %ymm2 | |||||
| vpermpd $0x1b , %ymm3, %ymm3 | |||||
| vpermpd $0xb1 , %ymm2, %ymm2 | |||||
| vpermpd $0xb1 , %ymm3, %ymm3 | |||||
| vpermpd $ 0x1b , %ymm2, %ymm2 | |||||
| vpermpd $ 0x1b , %ymm3, %ymm3 | |||||
| vpermpd $ 0xb1 , %ymm2, %ymm2 | |||||
| vpermpd $ 0xb1 , %ymm3, %ymm3 | |||||
| vblendpd $0x03, %ymm0, %ymm2 , %ymm4 | |||||
| vblendpd $0x03, %ymm1, %ymm3 , %ymm5 | |||||
| vblendpd $0x03, %ymm2, %ymm0 , %ymm6 | |||||
| vblendpd $0x03, %ymm3, %ymm1 , %ymm7 | |||||
| vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 | |||||
| vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 | |||||
| vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 | |||||
| vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 | |||||
| leaq (CO1, LDC, 2), %rax | leaq (CO1, LDC, 2), %rax | ||||
| @@ -823,7 +823,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups %ymm6 , (%rax) | vmovups %ymm6 , (%rax) | ||||
| vmovups %ymm7 , (%rax, LDC) | vmovups %ymm7 , (%rax, LDC) | ||||
| addq $4*SIZE, CO1 | |||||
| addq $ 4*SIZE, CO1 | |||||
| .endm | .endm | ||||
| /******************************************************************************************/ | /******************************************************************************************/ | ||||
| @@ -848,9 +848,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vfmadd231pd %xmm0 ,%xmm2 , %xmm5 | vfmadd231pd %xmm0 ,%xmm2 , %xmm5 | ||||
| vmovddup -9 * SIZE(BO), %xmm8 | vmovddup -9 * SIZE(BO), %xmm8 | ||||
| vfmadd231pd %xmm0 ,%xmm3 , %xmm6 | vfmadd231pd %xmm0 ,%xmm3 , %xmm6 | ||||
| addq $4*SIZE, BO | |||||
| addq $ 4*SIZE, BO | |||||
| vfmadd231pd %xmm0 ,%xmm8 , %xmm7 | vfmadd231pd %xmm0 ,%xmm8 , %xmm7 | ||||
| addq $2*SIZE, AO | |||||
| addq $ 2*SIZE, AO | |||||
| .endm | .endm | ||||
| @@ -880,7 +880,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups %xmm6 , (%rax) | vmovups %xmm6 , (%rax) | ||||
| vmovups %xmm7 , (%rax, LDC) | vmovups %xmm7 , (%rax, LDC) | ||||
| addq $2*SIZE, CO1 | |||||
| addq $ 2*SIZE, CO1 | |||||
| .endm | .endm | ||||
| /******************************************************************************************/ | /******************************************************************************************/ | ||||
| @@ -905,9 +905,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vfmadd231sd %xmm0 ,%xmm2 , %xmm5 | vfmadd231sd %xmm0 ,%xmm2 , %xmm5 | ||||
| vmovsd -9 * SIZE(BO), %xmm8 | vmovsd -9 * SIZE(BO), %xmm8 | ||||
| vfmadd231sd %xmm0 ,%xmm3 , %xmm6 | vfmadd231sd %xmm0 ,%xmm3 , %xmm6 | ||||
| addq $4*SIZE, BO | |||||
| addq $ 4*SIZE, BO | |||||
| vfmadd231sd %xmm0 ,%xmm8 , %xmm7 | vfmadd231sd %xmm0 ,%xmm8 , %xmm7 | ||||
| addq $1*SIZE, AO | |||||
| addq $ 1*SIZE, AO | |||||
| .endm | .endm | ||||
| @@ -937,7 +937,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovsd %xmm6 , (%rax) | vmovsd %xmm6 , (%rax) | ||||
| vmovsd %xmm7 , (%rax, LDC) | vmovsd %xmm7 , (%rax, LDC) | ||||
| addq $1*SIZE, CO1 | |||||
| addq $ 1*SIZE, CO1 | |||||
| .endm | .endm | ||||
| @@ -963,8 +963,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vfmadd231pd %xmm1 ,%xmm2 , %xmm5 | vfmadd231pd %xmm1 ,%xmm2 , %xmm5 | ||||
| vfmadd231pd %xmm0 ,%xmm3 , %xmm6 | vfmadd231pd %xmm0 ,%xmm3 , %xmm6 | ||||
| vfmadd231pd %xmm1 ,%xmm3 , %xmm7 | vfmadd231pd %xmm1 ,%xmm3 , %xmm7 | ||||
| addq $2*SIZE, BO | |||||
| addq $4*SIZE, AO | |||||
| addq $ 2*SIZE, BO | |||||
| addq $ 4*SIZE, AO | |||||
| .endm | .endm | ||||
| @@ -993,7 +993,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups %xmm6 , (CO1, LDC) | vmovups %xmm6 , (CO1, LDC) | ||||
| vmovups %xmm7 , 2 * SIZE(CO1, LDC) | vmovups %xmm7 , 2 * SIZE(CO1, LDC) | ||||
| addq $4*SIZE, CO1 | |||||
| addq $ 4*SIZE, CO1 | |||||
| .endm | .endm | ||||
| @@ -1014,8 +1014,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovddup -11 * SIZE(BO), %xmm3 | vmovddup -11 * SIZE(BO), %xmm3 | ||||
| vfmadd231pd %xmm0 ,%xmm2 , %xmm4 | vfmadd231pd %xmm0 ,%xmm2 , %xmm4 | ||||
| vfmadd231pd %xmm0 ,%xmm3 , %xmm6 | vfmadd231pd %xmm0 ,%xmm3 , %xmm6 | ||||
| addq $2*SIZE, BO | |||||
| addq $2*SIZE, AO | |||||
| addq $ 2*SIZE, BO | |||||
| addq $ 2*SIZE, AO | |||||
| .endm | .endm | ||||
| @@ -1038,7 +1038,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups %xmm4 , (CO1) | vmovups %xmm4 , (CO1) | ||||
| vmovups %xmm6 , (CO1, LDC) | vmovups %xmm6 , (CO1, LDC) | ||||
| addq $2*SIZE, CO1 | |||||
| addq $ 2*SIZE, CO1 | |||||
| .endm | .endm | ||||
| /******************************************************************************************/ | /******************************************************************************************/ | ||||
| @@ -1058,8 +1058,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovsd -11 * SIZE(BO), %xmm2 | vmovsd -11 * SIZE(BO), %xmm2 | ||||
| vfmadd231sd %xmm0 ,%xmm1 , %xmm4 | vfmadd231sd %xmm0 ,%xmm1 , %xmm4 | ||||
| vfmadd231sd %xmm0 ,%xmm2 , %xmm5 | vfmadd231sd %xmm0 ,%xmm2 , %xmm5 | ||||
| addq $2*SIZE, BO | |||||
| addq $1*SIZE, AO | |||||
| addq $ 2*SIZE, BO | |||||
| addq $ 1*SIZE, AO | |||||
| .endm | .endm | ||||
| @@ -1082,7 +1082,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovsd %xmm4 , (CO1) | vmovsd %xmm4 , (CO1) | ||||
| vmovsd %xmm5 , (CO1, LDC) | vmovsd %xmm5 , (CO1, LDC) | ||||
| addq $1*SIZE, CO1 | |||||
| addq $ 1*SIZE, CO1 | |||||
| .endm | .endm | ||||
| @@ -1103,8 +1103,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups -14 * SIZE(AO), %xmm1 | vmovups -14 * SIZE(AO), %xmm1 | ||||
| vfmadd231pd %xmm0 ,%xmm2 , %xmm4 | vfmadd231pd %xmm0 ,%xmm2 , %xmm4 | ||||
| vfmadd231pd %xmm1 ,%xmm2 , %xmm5 | vfmadd231pd %xmm1 ,%xmm2 , %xmm5 | ||||
| addq $1*SIZE, BO | |||||
| addq $4*SIZE, AO | |||||
| addq $ 1*SIZE, BO | |||||
| addq $ 4*SIZE, AO | |||||
| .endm | .endm | ||||
| @@ -1127,7 +1127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups %xmm4 , (CO1) | vmovups %xmm4 , (CO1) | ||||
| vmovups %xmm5 , 2 * SIZE(CO1) | vmovups %xmm5 , 2 * SIZE(CO1) | ||||
| addq $4*SIZE, CO1 | |||||
| addq $ 4*SIZE, CO1 | |||||
| .endm | .endm | ||||
| @@ -1145,8 +1145,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovddup -12 * SIZE(BO), %xmm2 | vmovddup -12 * SIZE(BO), %xmm2 | ||||
| vmovups -16 * SIZE(AO), %xmm0 | vmovups -16 * SIZE(AO), %xmm0 | ||||
| vfmadd231pd %xmm0 ,%xmm2 , %xmm4 | vfmadd231pd %xmm0 ,%xmm2 , %xmm4 | ||||
| addq $1*SIZE, BO | |||||
| addq $2*SIZE, AO | |||||
| addq $ 1*SIZE, BO | |||||
| addq $ 2*SIZE, AO | |||||
| .endm | .endm | ||||
| @@ -1166,7 +1166,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups %xmm4 , (CO1) | vmovups %xmm4 , (CO1) | ||||
| addq $2*SIZE, CO1 | |||||
| addq $ 2*SIZE, CO1 | |||||
| .endm | .endm | ||||
| @@ -1184,8 +1184,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovsd -12 * SIZE(BO), %xmm1 | vmovsd -12 * SIZE(BO), %xmm1 | ||||
| vmovsd -16 * SIZE(AO), %xmm0 | vmovsd -16 * SIZE(AO), %xmm0 | ||||
| vfmadd231sd %xmm0 ,%xmm1 , %xmm4 | vfmadd231sd %xmm0 ,%xmm1 , %xmm4 | ||||
| addq $1*SIZE, BO | |||||
| addq $1*SIZE, AO | |||||
| addq $ 1*SIZE, BO | |||||
| addq $ 1*SIZE, AO | |||||
| .endm | .endm | ||||
| @@ -1205,7 +1205,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovsd %xmm4 , (CO1) | vmovsd %xmm4 , (CO1) | ||||
| addq $1*SIZE, CO1 | |||||
| addq $ 1*SIZE, CO1 | |||||
| .endm | .endm | ||||
| @@ -1262,13 +1262,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| STACK_TOUCH | STACK_TOUCH | ||||
| cmpq $0, OLD_M | |||||
| cmpq $ 0, OLD_M | |||||
| je .L999 | je .L999 | ||||
| cmpq $0, OLD_N | |||||
| cmpq $ 0, OLD_N | |||||
| je .L999 | je .L999 | ||||
| cmpq $0, OLD_K | |||||
| cmpq $ 0, OLD_K | |||||
| je .L999 | je .L999 | ||||
| movq OLD_M, M | movq OLD_M, M | ||||
| @@ -1288,7 +1288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| movq Ndiv12, J | movq Ndiv12, J | ||||
| cmpq $0, J | |||||
| cmpq $ 0, J | |||||
| je .L4_0 | je .L4_0 | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -1330,10 +1330,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups %ymm6, 16 * SIZE(BO) | vmovups %ymm6, 16 * SIZE(BO) | ||||
| vmovups %ymm7, 20 * SIZE(BO) | vmovups %ymm7, 20 * SIZE(BO) | ||||
| addq $8 * SIZE ,BO1 | |||||
| addq $8 * SIZE ,BO2 | |||||
| addq $8 * SIZE ,BO3 | |||||
| addq $24 *SIZE ,BO | |||||
| addq $ 8 * SIZE ,BO1 | |||||
| addq $ 8 * SIZE ,BO2 | |||||
| addq $ 8 * SIZE ,BO3 | |||||
| addq $ 24 *SIZE ,BO | |||||
| decq %rax | decq %rax | ||||
| jnz .L12_01a_1 | jnz .L12_01a_1 | ||||
| @@ -1356,10 +1356,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups %ymm1, 0 * SIZE(BO) | vmovups %ymm1, 0 * SIZE(BO) | ||||
| vmovups %ymm2, 4 * SIZE(BO) | vmovups %ymm2, 4 * SIZE(BO) | ||||
| vmovups %ymm3, 8 * SIZE(BO) | vmovups %ymm3, 8 * SIZE(BO) | ||||
| addq $4*SIZE,BO1 | |||||
| addq $4*SIZE,BO2 | |||||
| addq $4*SIZE,BO3 | |||||
| addq $12*SIZE,BO | |||||
| addq $ 4*SIZE,BO1 | |||||
| addq $ 4*SIZE,BO2 | |||||
| addq $ 4*SIZE,BO3 | |||||
| addq $ 12*SIZE,BO | |||||
| decq %rax | decq %rax | ||||
| jnz .L12_02b | jnz .L12_02b | ||||
| @@ -1407,8 +1407,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| subq $2, %rax | subq $2, %rax | ||||
| je .L12_12a | je .L12_12a | ||||
| .align 32 | |||||
| ALIGN_5 | |||||
| .L12_12: | .L12_12: | ||||
| KERNEL4x12_M1 | KERNEL4x12_M1 | ||||
| @@ -1621,7 +1620,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L4_0: | .L4_0: | ||||
| cmpq $0, Nmod12 // N % 12 == 0 | |||||
| cmpq $ 0, Nmod12 // N % 12 == 0 | |||||
| je .L999 | je .L999 | ||||
| movq Nmod12, J | movq Nmod12, J | ||||
| @@ -1666,7 +1665,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| subq $2, %rax | subq $2, %rax | ||||
| je .L4_12a | je .L4_12a | ||||
| .align 32 | |||||
| ALIGN_5 | |||||
| .L4_12: | .L4_12: | ||||
| @@ -1912,7 +1911,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| je .L2_16 | je .L2_16 | ||||
| .align 32 | |||||
| ALIGN_5 | |||||
| .L2_12: | .L2_12: | ||||
| @@ -2108,7 +2107,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| sarq $3, %rax // K / 8 | sarq $3, %rax // K / 8 | ||||
| je .L1_16 | je .L1_16 | ||||
| .align 32 | |||||
| ALIGN_5 | |||||
| .L1_12: | .L1_12: | ||||
| @@ -2362,13 +2361,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| STACK_TOUCH | STACK_TOUCH | ||||
| cmpq $0, OLD_M | |||||
| cmpq $ 0, OLD_M | |||||
| je .L999 | je .L999 | ||||
| cmpq $0, OLD_N | |||||
| cmpq $ 0, OLD_N | |||||
| je .L999 | je .L999 | ||||
| cmpq $0, OLD_K | |||||
| cmpq $ 0, OLD_K | |||||
| je .L999 | je .L999 | ||||
| movq OLD_M, M | movq OLD_M, M | ||||
| @@ -2397,7 +2396,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| movq Ndiv12, J | movq Ndiv12, J | ||||
| cmpq $0, J | |||||
| cmpq $ 0, J | |||||
| je .L2_0 | je .L2_0 | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -2471,7 +2470,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| subq $2, %rax | subq $2, %rax | ||||
| je .L4_12a | je .L4_12a | ||||
| .align 32 | |||||
| ALIGN_5 | |||||
| .L4_12: | .L4_12: | ||||
| @@ -2848,7 +2847,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| je .L2_16 | je .L2_16 | ||||
| .align 32 | |||||
| ALIGN_5 | |||||
| .L2_12: | .L2_12: | ||||
| @@ -3176,7 +3175,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| sarq $3, %rax // K / 8 | sarq $3, %rax // K / 8 | ||||
| je .L1_16 | je .L1_16 | ||||
| .align 32 | |||||
| ALIGN_5 | |||||
| .L1_12: | .L1_12: | ||||
| @@ -196,7 +196,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups -12 * SIZE(AO), %xmm0 | vmovups -12 * SIZE(AO), %xmm0 | ||||
| vmulpd %xmm1,%xmm0,%xmm10 | vmulpd %xmm1,%xmm0,%xmm10 | ||||
| vmulpd %xmm2,%xmm0,%xmm11 | vmulpd %xmm2,%xmm0,%xmm11 | ||||
| addq $3*SIZE, BO | |||||
| addq $ 3 * SIZE, BO | |||||
| vmulpd %xmm3,%xmm0,%xmm12 | vmulpd %xmm3,%xmm0,%xmm12 | ||||
| vmovups -10 * SIZE(AO), %xmm0 | vmovups -10 * SIZE(AO), %xmm0 | ||||
| vmulpd %xmm1,%xmm0,%xmm13 | vmulpd %xmm1,%xmm0,%xmm13 | ||||
| @@ -294,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups 14 * SIZE(AO), %xmm0 | vmovups 14 * SIZE(AO), %xmm0 | ||||
| VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) | VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) | ||||
| vmovddup -3 * SIZE(BO), %xmm1 | vmovddup -3 * SIZE(BO), %xmm1 | ||||
| addq $32 * SIZE, AO | |||||
| addq $ 32 * SIZE, AO | |||||
| VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) | VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) | ||||
| vmovddup -2 * SIZE(BO), %xmm2 | vmovddup -2 * SIZE(BO), %xmm2 | ||||
| VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) | VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) | ||||
| @@ -392,8 +392,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovddup 10 * SIZE(BO), %xmm2 | vmovddup 10 * SIZE(BO), %xmm2 | ||||
| VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) | VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) | ||||
| vmovddup 11 * SIZE(BO), %xmm3 | vmovddup 11 * SIZE(BO), %xmm3 | ||||
| addq $32 * SIZE, AO | |||||
| addq $24 * SIZE, BO | |||||
| addq $ 32 * SIZE, AO | |||||
| addq $ 24 * SIZE, BO | |||||
| .endm | .endm | ||||
| @@ -414,9 +414,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) | VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) | ||||
| vmovups 14 * SIZE(AO), %xmm0 | vmovups 14 * SIZE(AO), %xmm0 | ||||
| VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) | VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) | ||||
| addq $32*SIZE, AO | |||||
| addq $ 32 * SIZE, AO | |||||
| VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) | VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) | ||||
| addq $21*SIZE, BO | |||||
| addq $ 21 * SIZE, BO | |||||
| VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) | VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) | ||||
| .endm | .endm | ||||
| @@ -438,9 +438,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) | VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) | ||||
| vmovups -10 * SIZE(AO), %xmm0 | vmovups -10 * SIZE(AO), %xmm0 | ||||
| VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) | VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) | ||||
| addq $3*SIZE, BO | |||||
| addq $ 3 * SIZE, BO | |||||
| VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) | VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) | ||||
| addq $8*SIZE, AO | |||||
| addq $ 8 * SIZE, AO | |||||
| VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) | VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) | ||||
| .endm | .endm | ||||
| @@ -483,7 +483,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| prefetcht0 C_PR1(CO1,LDC) | prefetcht0 C_PR1(CO1,LDC) | ||||
| prefetcht0 C_PR1(CO1,LDC,2) | prefetcht0 C_PR1(CO1,LDC,2) | ||||
| addq $8 * SIZE, CO1 # coffset += 8 | |||||
| addq $ 8 * SIZE, CO1 # coffset += 8 | |||||
| .endm | .endm | ||||
| @@ -1165,9 +1165,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovsd %xmm5, 8*SIZE(BO) | vmovsd %xmm5, 8*SIZE(BO) | ||||
| vmovups %xmm6, 9*SIZE(BO) | vmovups %xmm6, 9*SIZE(BO) | ||||
| vmovsd %xmm7,11*SIZE(BO) | vmovsd %xmm7,11*SIZE(BO) | ||||
| addq $8*SIZE,BO1 | |||||
| addq $8*SIZE,BO2 | |||||
| addq $12*SIZE,BO | |||||
| addq $ 8*SIZE,BO1 | |||||
| addq $ 8*SIZE,BO2 | |||||
| addq $ 12*SIZE,BO | |||||
| decq %rax | decq %rax | ||||
| jnz .L6_02 | jnz .L6_02 | ||||
| @@ -1184,9 +1184,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovsd (BO2), %xmm1 | vmovsd (BO2), %xmm1 | ||||
| vmovups %xmm0, (BO) | vmovups %xmm0, (BO) | ||||
| vmovsd %xmm1, 2*SIZE(BO) | vmovsd %xmm1, 2*SIZE(BO) | ||||
| addq $2*SIZE,BO1 | |||||
| addq $2*SIZE,BO2 | |||||
| addq $3*SIZE,BO | |||||
| addq $ 2*SIZE,BO1 | |||||
| addq $ 2*SIZE,BO2 | |||||
| addq $ 3*SIZE,BO | |||||
| decq %rax | decq %rax | ||||
| jnz .L6_02b | jnz .L6_02b | ||||
| @@ -1223,9 +1223,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups %xmm4, 7*SIZE(BO) | vmovups %xmm4, 7*SIZE(BO) | ||||
| vmovsd %xmm7, 9*SIZE(BO) | vmovsd %xmm7, 9*SIZE(BO) | ||||
| vmovups %xmm6,10*SIZE(BO) | vmovups %xmm6,10*SIZE(BO) | ||||
| addq $8*SIZE,BO1 | |||||
| addq $8*SIZE,BO2 | |||||
| addq $12*SIZE,BO | |||||
| addq $ 8*SIZE,BO1 | |||||
| addq $ 8*SIZE,BO2 | |||||
| addq $ 12*SIZE,BO | |||||
| decq %rax | decq %rax | ||||
| jnz .L6_03 | jnz .L6_03 | ||||
| @@ -1243,9 +1243,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups (BO2), %xmm1 | vmovups (BO2), %xmm1 | ||||
| vmovsd %xmm0, (BO) | vmovsd %xmm0, (BO) | ||||
| vmovups %xmm1, 1*SIZE(BO) | vmovups %xmm1, 1*SIZE(BO) | ||||
| addq $2*SIZE,BO1 | |||||
| addq $2*SIZE,BO2 | |||||
| addq $3*SIZE,BO | |||||
| addq $ 2*SIZE,BO1 | |||||
| addq $ 2*SIZE,BO2 | |||||
| addq $ 3*SIZE,BO | |||||
| decq %rax | decq %rax | ||||
| jnz .L6_03b | jnz .L6_03b | ||||
| @@ -166,8 +166,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) | VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) | ||||
| VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) | VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) | ||||
| VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) | VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) | ||||
| addq $4 , BI | |||||
| addq $16, %rax | |||||
| addq $ 4 , BI | |||||
| addq $ 16, %rax | |||||
| .endm | .endm | ||||
| .macro SAVE16x4 | .macro SAVE16x4 | ||||
| @@ -233,8 +233,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 | vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 | ||||
| VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) | VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) | ||||
| VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) | VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) | ||||
| addq $4 , BI | |||||
| addq $8 , %rax | |||||
| addq $ 4 , BI | |||||
| addq $ 8 , %rax | |||||
| .endm | .endm | ||||
| .macro SAVE8x4 | .macro SAVE8x4 | ||||
| @@ -277,8 +277,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 | vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 | ||||
| VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) | VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) | ||||
| VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) | VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) | ||||
| addq $4 , BI | |||||
| addq $4 , %rax | |||||
| addq $ 4 , BI | |||||
| addq $ 4 , %rax | |||||
| .endm | .endm | ||||
| .macro SAVE4x4 | .macro SAVE4x4 | ||||
| @@ -325,8 +325,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) | VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) | ||||
| VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) | VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) | ||||
| VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) | VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) | ||||
| addq $4 , BI | |||||
| addq $2, %rax | |||||
| addq $ 4 , BI | |||||
| addq $ 2, %rax | |||||
| .endm | .endm | ||||
| .macro SAVE2x4 | .macro SAVE2x4 | ||||
| @@ -386,8 +386,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 | vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 | ||||
| VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) | VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) | ||||
| VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) | VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) | ||||
| addq $4 , BI | |||||
| addq $1, %rax | |||||
| addq $ 4 , BI | |||||
| addq $ 1, %rax | |||||
| .endm | .endm | ||||
| .macro SAVE1x4 | .macro SAVE1x4 | ||||
| @@ -432,8 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) | VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) | ||||
| VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) | VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) | ||||
| VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) | VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) | ||||
| addq $2 , BI | |||||
| addq $16, %rax | |||||
| addq $ 2 , BI | |||||
| addq $ 16, %rax | |||||
| .endm | .endm | ||||
| .macro SAVE16x2 | .macro SAVE16x2 | ||||
| @@ -474,8 +474,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 | vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 | ||||
| VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) | VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) | ||||
| VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) | VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) | ||||
| addq $2 , BI | |||||
| addq $8 , %rax | |||||
| addq $ 2 , BI | |||||
| addq $ 8 , %rax | |||||
| .endm | .endm | ||||
| .macro SAVE8x2 | .macro SAVE8x2 | ||||
| @@ -507,8 +507,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 | vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 | ||||
| VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) | VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) | ||||
| VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) | VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) | ||||
| addq $2 , BI | |||||
| addq $4 , %rax | |||||
| addq $ 2 , BI | |||||
| addq $ 4 , %rax | |||||
| .endm | .endm | ||||
| .macro SAVE4x2 | .macro SAVE4x2 | ||||
| @@ -542,8 +542,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) | VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) | ||||
| VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) | VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) | ||||
| VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) | VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) | ||||
| addq $2 , BI | |||||
| addq $2, %rax | |||||
| addq $ 2 , BI | |||||
| addq $ 2, %rax | |||||
| .endm | .endm | ||||
| .macro SAVE2x2 | .macro SAVE2x2 | ||||
| @@ -583,8 +583,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 | vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 | ||||
| VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) | VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) | ||||
| VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) | VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) | ||||
| addq $2 , BI | |||||
| addq $1, %rax | |||||
| addq $ 2 , BI | |||||
| addq $ 1, %rax | |||||
| .endm | .endm | ||||
| .macro SAVE1x2 | .macro SAVE1x2 | ||||
| @@ -619,8 +619,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 | vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 | ||||
| VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) | VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) | ||||
| VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) | VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) | ||||
| addq $1 , BI | |||||
| addq $16, %rax | |||||
| addq $ 1 , BI | |||||
| addq $ 16, %rax | |||||
| .endm | .endm | ||||
| .macro SAVE16x1 | .macro SAVE16x1 | ||||
| @@ -649,8 +649,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 | vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 | ||||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 | vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 | ||||
| VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) | VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) | ||||
| addq $1 , BI | |||||
| addq $8 , %rax | |||||
| addq $ 1 , BI | |||||
| addq $ 8 , %rax | |||||
| .endm | .endm | ||||
| .macro SAVE8x1 | .macro SAVE8x1 | ||||
| @@ -677,8 +677,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 | vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 | ||||
| vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 | vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 | ||||
| VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) | VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) | ||||
| addq $1 , BI | |||||
| addq $4 , %rax | |||||
| addq $ 1 , BI | |||||
| addq $ 4 , %rax | |||||
| .endm | .endm | ||||
| .macro SAVE4x1 | .macro SAVE4x1 | ||||
| @@ -706,8 +706,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 | vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 | ||||
| VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) | VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) | ||||
| VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) | VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) | ||||
| addq $1 , BI | |||||
| addq $2, %rax | |||||
| addq $ 1 , BI | |||||
| addq $ 2 , %rax | |||||
| .endm | .endm | ||||
| .macro SAVE2x1 | .macro SAVE2x1 | ||||
| @@ -736,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 | vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 | ||||
| vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 | vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 | ||||
| VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) | VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) | ||||
| addq $1 , BI | |||||
| addq $1, %rax | |||||
| addq $ 1 , BI | |||||
| addq $ 1 , %rax | |||||
| .endm | .endm | ||||
| .macro SAVE1x1 | .macro SAVE1x1 | ||||
| @@ -882,8 +882,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups %xmm2, 8*SIZE(BO) | vmovups %xmm2, 8*SIZE(BO) | ||||
| vmovups %xmm3,12*SIZE(BO) | vmovups %xmm3,12*SIZE(BO) | ||||
| addq $16*SIZE,BO1 | |||||
| addq $16*SIZE,BO | |||||
| addq $ 16*SIZE,BO1 | |||||
| addq $ 16*SIZE,BO | |||||
| decq %rax | decq %rax | ||||
| jnz .L4_01a | jnz .L4_01a | ||||
| @@ -899,8 +899,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups (BO1), %xmm0 | vmovups (BO1), %xmm0 | ||||
| vmovups %xmm0, (BO) | vmovups %xmm0, (BO) | ||||
| addq $4*SIZE,BO1 | |||||
| addq $4*SIZE,BO | |||||
| addq $ 4*SIZE,BO1 | |||||
| addq $ 4*SIZE,BO | |||||
| decq %rax | decq %rax | ||||
| jnz .L4_02c | jnz .L4_02c | ||||
| @@ -919,7 +919,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| movq A, AO // aoffset = a | movq A, AO // aoffset = a | ||||
| addq $16 * SIZE, AO | |||||
| addq $ 16 * SIZE, AO | |||||
| movq M, I | movq M, I | ||||
| sarq $4, I // i = (m >> 4) | sarq $4, I // i = (m >> 4) | ||||
| @@ -109,22 +109,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(OS_WINDOWS) | #if defined(OS_WINDOWS) | ||||
| #if L_BUFFER_SIZE > 16384 | #if L_BUFFER_SIZE > 16384 | ||||
| #define STACK_TOUCH \ | #define STACK_TOUCH \ | ||||
| movl $0, 4096 * 4(%rsp);\ | |||||
| movl $0, 4096 * 3(%rsp);\ | |||||
| movl $0, 4096 * 2(%rsp);\ | |||||
| movl $0, 4096 * 1(%rsp); | |||||
| movl $ 0, 4096 * 4(%rsp);\ | |||||
| movl $ 0, 4096 * 3(%rsp);\ | |||||
| movl $ 0, 4096 * 2(%rsp);\ | |||||
| movl $ 0, 4096 * 1(%rsp); | |||||
| #elif L_BUFFER_SIZE > 12288 | #elif L_BUFFER_SIZE > 12288 | ||||
| #define STACK_TOUCH \ | #define STACK_TOUCH \ | ||||
| movl $0, 4096 * 3(%rsp);\ | |||||
| movl $0, 4096 * 2(%rsp);\ | |||||
| movl $0, 4096 * 1(%rsp); | |||||
| movl $ 0, 4096 * 3(%rsp);\ | |||||
| movl $ 0, 4096 * 2(%rsp);\ | |||||
| movl $ 0, 4096 * 1(%rsp); | |||||
| #elif L_BUFFER_SIZE > 8192 | #elif L_BUFFER_SIZE > 8192 | ||||
| #define STACK_TOUCH \ | #define STACK_TOUCH \ | ||||
| movl $0, 4096 * 2(%rsp);\ | |||||
| movl $0, 4096 * 1(%rsp); | |||||
| movl $ 0, 4096 * 2(%rsp);\ | |||||
| movl $ 0, 4096 * 1(%rsp); | |||||
| #elif L_BUFFER_SIZE > 4096 | #elif L_BUFFER_SIZE > 4096 | ||||
| #define STACK_TOUCH \ | #define STACK_TOUCH \ | ||||
| movl $0, 4096 * 1(%rsp); | |||||
| movl $ 0, 4096 * 1(%rsp); | |||||
| #else | #else | ||||
| #define STACK_TOUCH | #define STACK_TOUCH | ||||
| #endif | #endif | ||||
| @@ -212,8 +212,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) | VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) | ||||
| VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) | VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) | ||||
| addq $4, BI | |||||
| addq $8, %rax | |||||
| addq $ 4, BI | |||||
| addq $ 8, %rax | |||||
| .endm | .endm | ||||
| .macro SAVE4x2 | .macro SAVE4x2 | ||||
| @@ -222,10 +222,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vbroadcastsd ALPHA_I, %ymm1 | vbroadcastsd ALPHA_I, %ymm1 | ||||
| // swap high and low 8 bytes | // swap high and low 8 bytes | ||||
| vshufpd $0x05, %ymm9 , %ymm9, %ymm9 | |||||
| vshufpd $0x05, %ymm11, %ymm11, %ymm11 | |||||
| vshufpd $0x05, %ymm13, %ymm13, %ymm13 | |||||
| vshufpd $0x05, %ymm15, %ymm15, %ymm15 | |||||
| vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 | |||||
| vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 | |||||
| vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 | |||||
| vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ | ||||
| defined(NR) || defined(NC) || defined(TR) || defined(TC) | defined(NR) || defined(NC) || defined(TR) || defined(TC) | ||||
| @@ -235,10 +235,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vaddsubpd %ymm13,%ymm12, %ymm12 | vaddsubpd %ymm13,%ymm12, %ymm12 | ||||
| vaddsubpd %ymm15,%ymm14, %ymm14 | vaddsubpd %ymm15,%ymm14, %ymm14 | ||||
| vshufpd $0x05, %ymm8 , %ymm8, %ymm9 | |||||
| vshufpd $0x05, %ymm10, %ymm10, %ymm11 | |||||
| vshufpd $0x05, %ymm12, %ymm12, %ymm13 | |||||
| vshufpd $0x05, %ymm14, %ymm14, %ymm15 | |||||
| vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 | |||||
| vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 | |||||
| vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 | |||||
| vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 | |||||
| #else | #else | ||||
| vaddsubpd %ymm8, %ymm9 ,%ymm9 | vaddsubpd %ymm8, %ymm9 ,%ymm9 | ||||
| @@ -252,10 +252,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovapd %ymm15, %ymm14 | vmovapd %ymm15, %ymm14 | ||||
| // swap high and low 8 bytes | // swap high and low 8 bytes | ||||
| vshufpd $0x05, %ymm9 , %ymm9, %ymm9 | |||||
| vshufpd $0x05, %ymm11, %ymm11, %ymm11 | |||||
| vshufpd $0x05, %ymm13, %ymm13, %ymm13 | |||||
| vshufpd $0x05, %ymm15, %ymm15, %ymm15 | |||||
| vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 | |||||
| vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 | |||||
| vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 | |||||
| vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 | |||||
| #endif | #endif | ||||
| @@ -316,8 +316,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 | vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 | ||||
| VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) | VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) | ||||
| VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) | VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) | ||||
| addq $4, BI | |||||
| addq $4, %rax | |||||
| addq $ 4, BI | |||||
| addq $ 4, %rax | |||||
| .endm | .endm | ||||
| .macro SAVE2x2 | .macro SAVE2x2 | ||||
| @@ -326,10 +326,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovddup ALPHA_I, %xmm1 | vmovddup ALPHA_I, %xmm1 | ||||
| // swap high and low 64 bits | // swap high and low 64 bits | ||||
| vshufpd $0x01, %xmm9 , %xmm9, %xmm9 | |||||
| vshufpd $0x01, %xmm11, %xmm11, %xmm11 | |||||
| vshufpd $0x01, %xmm13, %xmm13, %xmm13 | |||||
| vshufpd $0x01, %xmm15, %xmm15, %xmm15 | |||||
| vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 | |||||
| vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 | |||||
| vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 | |||||
| vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ | ||||
| defined(NR) || defined(NC) || defined(TR) || defined(TC) | defined(NR) || defined(NC) || defined(TR) || defined(TC) | ||||
| @@ -339,10 +339,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vaddsubpd %xmm13,%xmm12, %xmm12 | vaddsubpd %xmm13,%xmm12, %xmm12 | ||||
| vaddsubpd %xmm15,%xmm14, %xmm14 | vaddsubpd %xmm15,%xmm14, %xmm14 | ||||
| vshufpd $0x01, %xmm8 , %xmm8, %xmm9 | |||||
| vshufpd $0x01, %xmm10, %xmm10, %xmm11 | |||||
| vshufpd $0x01, %xmm12, %xmm12, %xmm13 | |||||
| vshufpd $0x01, %xmm14, %xmm14, %xmm15 | |||||
| vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 | |||||
| vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 | |||||
| vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 | |||||
| vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 | |||||
| #else | #else | ||||
| vaddsubpd %xmm8, %xmm9 ,%xmm9 | vaddsubpd %xmm8, %xmm9 ,%xmm9 | ||||
| @@ -356,10 +356,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovapd %xmm15, %xmm14 | vmovapd %xmm15, %xmm14 | ||||
| // swap high and low 64 bits | // swap high and low 64 bits | ||||
| vshufpd $0x01, %xmm9 , %xmm9, %xmm9 | |||||
| vshufpd $0x01, %xmm11, %xmm11, %xmm11 | |||||
| vshufpd $0x01, %xmm13, %xmm13, %xmm13 | |||||
| vshufpd $0x01, %xmm15, %xmm15, %xmm15 | |||||
| vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 | |||||
| vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 | |||||
| vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 | |||||
| vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 | |||||
| #endif | #endif | ||||
| @@ -415,8 +415,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 | vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 | ||||
| VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) | VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) | ||||
| VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) | VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) | ||||
| addq $4, BI | |||||
| addq $2, %rax | |||||
| addq $ 4, BI | |||||
| addq $ 2, %rax | |||||
| .endm | .endm | ||||
| .macro SAVE1x2 | .macro SAVE1x2 | ||||
| @@ -425,8 +425,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovddup ALPHA_I, %xmm1 | vmovddup ALPHA_I, %xmm1 | ||||
| // swap high and low 64 bits | // swap high and low 64 bits | ||||
| vshufpd $0x01, %xmm9 , %xmm9, %xmm9 | |||||
| vshufpd $0x01, %xmm11, %xmm11, %xmm11 | |||||
| vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 | |||||
| vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ | ||||
| defined(NR) || defined(NC) || defined(TR) || defined(TC) | defined(NR) || defined(NC) || defined(TR) || defined(TC) | ||||
| @@ -434,8 +434,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vaddsubpd %xmm9, %xmm8 , %xmm8 | vaddsubpd %xmm9, %xmm8 , %xmm8 | ||||
| vaddsubpd %xmm11,%xmm10, %xmm10 | vaddsubpd %xmm11,%xmm10, %xmm10 | ||||
| vshufpd $0x01, %xmm8 , %xmm8, %xmm9 | |||||
| vshufpd $0x01, %xmm10, %xmm10, %xmm11 | |||||
| vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 | |||||
| vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 | |||||
| #else | #else | ||||
| vaddsubpd %xmm8, %xmm9, %xmm9 | vaddsubpd %xmm8, %xmm9, %xmm9 | ||||
| @@ -445,8 +445,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovapd %xmm11, %xmm10 | vmovapd %xmm11, %xmm10 | ||||
| // swap high and low 64 bits | // swap high and low 64 bits | ||||
| vshufpd $0x01, %xmm9 , %xmm9, %xmm9 | |||||
| vshufpd $0x01, %xmm11, %xmm11, %xmm11 | |||||
| vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 | |||||
| vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 | |||||
| #endif | #endif | ||||
| @@ -486,8 +486,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) | VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) | ||||
| VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) | VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) | ||||
| addq $2, BI | |||||
| addq $8, %rax | |||||
| addq $ 2, BI | |||||
| addq $ 8, %rax | |||||
| .endm | .endm | ||||
| .macro SAVE4x1 | .macro SAVE4x1 | ||||
| @@ -496,8 +496,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vbroadcastsd ALPHA_I, %ymm1 | vbroadcastsd ALPHA_I, %ymm1 | ||||
| // swap high and low 8 bytes | // swap high and low 8 bytes | ||||
| vshufpd $0x05, %ymm9 , %ymm9, %ymm9 | |||||
| vshufpd $0x05, %ymm13, %ymm13, %ymm13 | |||||
| vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 | |||||
| vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ | ||||
| defined(NR) || defined(NC) || defined(TR) || defined(TC) | defined(NR) || defined(NC) || defined(TR) || defined(TC) | ||||
| @@ -505,8 +505,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vaddsubpd %ymm9, %ymm8 , %ymm8 | vaddsubpd %ymm9, %ymm8 , %ymm8 | ||||
| vaddsubpd %ymm13,%ymm12 , %ymm12 | vaddsubpd %ymm13,%ymm12 , %ymm12 | ||||
| vshufpd $0x05, %ymm8 , %ymm8, %ymm9 | |||||
| vshufpd $0x05, %ymm12, %ymm12, %ymm13 | |||||
| vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 | |||||
| vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 | |||||
| #else | #else | ||||
| vaddsubpd %ymm8, %ymm9 , %ymm9 | vaddsubpd %ymm8, %ymm9 , %ymm9 | ||||
| @@ -516,8 +516,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovapd %ymm13, %ymm12 | vmovapd %ymm13, %ymm12 | ||||
| // swap high and low 8 bytes | // swap high and low 8 bytes | ||||
| vshufpd $0x05, %ymm9 , %ymm9, %ymm9 | |||||
| vshufpd $0x05, %ymm13, %ymm13, %ymm13 | |||||
| vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 | |||||
| vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 | |||||
| #endif | #endif | ||||
| @@ -559,8 +559,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 | vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 | ||||
| VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) | VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) | ||||
| VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) | VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) | ||||
| addq $2, BI | |||||
| addq $4, %rax | |||||
| addq $ 2, BI | |||||
| addq $ 4, %rax | |||||
| .endm | .endm | ||||
| .macro SAVE2x1 | .macro SAVE2x1 | ||||
| @@ -569,8 +569,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovddup ALPHA_I, %xmm1 | vmovddup ALPHA_I, %xmm1 | ||||
| // swap high and low 64 bits | // swap high and low 64 bits | ||||
| vshufpd $0x01, %xmm9 , %xmm9, %xmm9 | |||||
| vshufpd $0x01, %xmm13, %xmm13, %xmm13 | |||||
| vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 | |||||
| vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ | ||||
| defined(NR) || defined(NC) || defined(TR) || defined(TC) | defined(NR) || defined(NC) || defined(TR) || defined(TC) | ||||
| @@ -578,8 +578,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vaddsubpd %xmm9, %xmm8 , %xmm8 | vaddsubpd %xmm9, %xmm8 , %xmm8 | ||||
| vaddsubpd %xmm13,%xmm12 , %xmm12 | vaddsubpd %xmm13,%xmm12 , %xmm12 | ||||
| vshufpd $0x01, %xmm8 , %xmm8, %xmm9 | |||||
| vshufpd $0x01, %xmm12, %xmm12, %xmm13 | |||||
| vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 | |||||
| vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 | |||||
| #else | #else | ||||
| vaddsubpd %xmm8, %xmm9 , %xmm9 | vaddsubpd %xmm8, %xmm9 , %xmm9 | ||||
| @@ -589,8 +589,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovapd %xmm13, %xmm12 | vmovapd %xmm13, %xmm12 | ||||
| // swap high and low 64 bits | // swap high and low 64 bits | ||||
| vshufpd $0x01, %xmm9 , %xmm9, %xmm9 | |||||
| vshufpd $0x01, %xmm13, %xmm13, %xmm13 | |||||
| vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 | |||||
| vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 | |||||
| #endif | #endif | ||||
| @@ -626,8 +626,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) | VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) | ||||
| vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 | vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 | ||||
| VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) | VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) | ||||
| addq $2, BI | |||||
| addq $2, %rax | |||||
| addq $ 2, BI | |||||
| addq $ 2, %rax | |||||
| .endm | .endm | ||||
| .macro SAVE1x1 | .macro SAVE1x1 | ||||
| @@ -636,14 +636,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovddup ALPHA_I, %xmm1 | vmovddup ALPHA_I, %xmm1 | ||||
| // swap high and low 64 bits | // swap high and low 64 bits | ||||
| vshufpd $0x01, %xmm9 , %xmm9, %xmm9 | |||||
| vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ | ||||
| defined(NR) || defined(NC) || defined(TR) || defined(TC) | defined(NR) || defined(NC) || defined(TR) || defined(TC) | ||||
| vaddsubpd %xmm9, %xmm8, %xmm8 | vaddsubpd %xmm9, %xmm8, %xmm8 | ||||
| vshufpd $0x01, %xmm8 , %xmm8, %xmm9 | |||||
| vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 | |||||
| #else | #else | ||||
| vaddsubpd %xmm8, %xmm9, %xmm9 | vaddsubpd %xmm8, %xmm9, %xmm9 | ||||
| @@ -651,7 +651,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovapd %xmm9, %xmm8 | vmovapd %xmm9, %xmm8 | ||||
| // swap high and low 64 bits | // swap high and low 64 bits | ||||
| vshufpd $0x01, %xmm9 , %xmm9, %xmm9 | |||||
| vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 | |||||
| #endif | #endif | ||||
| @@ -682,7 +682,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| PROLOGUE | PROLOGUE | ||||
| PROFCODE | PROFCODE | ||||
| subq $STACKSIZE, %rsp | |||||
| subq $ STACKSIZE, %rsp | |||||
| movq %rbx, (%rsp) | movq %rbx, (%rsp) | ||||
| movq %rbp, 8(%rsp) | movq %rbp, 8(%rsp) | ||||
| movq %r12, 16(%rsp) | movq %r12, 16(%rsp) | ||||
| @@ -727,18 +727,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| movq %rsp, SP # save old stack | movq %rsp, SP # save old stack | ||||
| subq $128 + L_BUFFER_SIZE, %rsp | |||||
| andq $-4096, %rsp # align stack | |||||
| subq $ 128 + L_BUFFER_SIZE, %rsp | |||||
| andq $ -4096, %rsp # align stack | |||||
| STACK_TOUCH | STACK_TOUCH | ||||
| cmpq $0, OLD_M | |||||
| cmpq $ 0, OLD_M | |||||
| je .L999 | je .L999 | ||||
| cmpq $0, OLD_N | |||||
| cmpq $ 0, OLD_N | |||||
| je .L999 | je .L999 | ||||
| cmpq $0, OLD_K | |||||
| cmpq $ 0, OLD_K | |||||
| je .L999 | je .L999 | ||||
| movq OLD_M, M | movq OLD_M, M | ||||
| @@ -748,11 +748,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovsd %xmm0, ALPHA_R | vmovsd %xmm0, ALPHA_R | ||||
| vmovsd %xmm1, ALPHA_I | vmovsd %xmm1, ALPHA_I | ||||
| salq $ZBASE_SHIFT, LDC | |||||
| salq $ ZBASE_SHIFT, LDC | |||||
| movq N, %rax | movq N, %rax | ||||
| xorq %rdx, %rdx | xorq %rdx, %rdx | ||||
| movq $2, %rdi | |||||
| movq $ 2, %rdi | |||||
| divq %rdi // N / 2 | divq %rdi // N / 2 | ||||
| movq %rax, Ndiv6 // N / 2 | movq %rax, Ndiv6 // N / 2 | ||||
| movq %rdx, Nmod6 // N % 2 | movq %rdx, Nmod6 // N % 2 | ||||
| @@ -770,7 +770,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L2_00_0: | .L2_00_0: | ||||
| movq Ndiv6, J | movq Ndiv6, J | ||||
| cmpq $0, J | |||||
| cmpq $ 0, J | |||||
| je .L1_2_0 | je .L1_2_0 | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -789,8 +789,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups 2 * SIZE(BO1), %xmm1 | vmovups 2 * SIZE(BO1), %xmm1 | ||||
| vmovups %xmm0, (BO) | vmovups %xmm0, (BO) | ||||
| vmovups %xmm1, 2 * SIZE(BO) | vmovups %xmm1, 2 * SIZE(BO) | ||||
| addq $4*SIZE,BO1 | |||||
| addq $4*SIZE,BO | |||||
| addq $ 4*SIZE,BO1 | |||||
| addq $ 4*SIZE,BO | |||||
| decq %rax | decq %rax | ||||
| jnz .L2_00_02b | jnz .L2_00_02b | ||||
| @@ -809,10 +809,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| movq A, AO // aoffset = a | movq A, AO // aoffset = a | ||||
| addq $8 * SIZE, AO | |||||
| addq $ 8 * SIZE, AO | |||||
| movq M, I | movq M, I | ||||
| sarq $2, I // i = (m >> 2) | |||||
| sarq $ 2, I // i = (m >> 2) | |||||
| je .L2_2_10 | je .L2_2_10 | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -825,15 +825,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ | (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ | ||||
| (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | ||||
| leaq BUFFER1, BO // first buffer to BO | leaq BUFFER1, BO // first buffer to BO | ||||
| addq $8 * SIZE, BO | |||||
| addq $ 8 * SIZE, BO | |||||
| #else | #else | ||||
| movq KK, %rax | movq KK, %rax | ||||
| leaq BUFFER1, BO // first buffer to BO | leaq BUFFER1, BO // first buffer to BO | ||||
| addq $8 * SIZE, BO | |||||
| addq $ 8 * SIZE, BO | |||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq (,BI,4), BI // BI = BI * 4 ; number of values | leaq (,BI,4), BI // BI = BI * 4 ; number of values | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| salq $3, %rax // rax = rax * 8 ; number of values | |||||
| salq $ 3, %rax // rax = rax * 8 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| #endif | #endif | ||||
| @@ -848,20 +848,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| movq KK, %rax | movq KK, %rax | ||||
| #ifdef LEFT | #ifdef LEFT | ||||
| addq $4, %rax // number of values in AO | |||||
| addq $ 4, %rax // number of values in AO | |||||
| #else | #else | ||||
| addq $2, %rax // number of values in BO | |||||
| addq $ 2, %rax // number of values in BO | |||||
| #endif | #endif | ||||
| movq %rax, KKK | movq %rax, KKK | ||||
| #endif | #endif | ||||
| andq $-8, %rax // K = K - ( K % 8 ) | |||||
| andq $ -8, %rax // K = K - ( K % 8 ) | |||||
| je .L2_4_16 | je .L2_4_16 | ||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | ||||
| salq $3, %rax // rax = rax * 8 ; number of values | |||||
| salq $ 3, %rax // rax = rax * 8 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| negq BI | negq BI | ||||
| @@ -928,13 +928,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| movq KKK, %rax | movq KKK, %rax | ||||
| #endif | #endif | ||||
| andq $7, %rax # if (k & 7) | |||||
| andq $ 7, %rax # if (k & 7) | |||||
| je .L2_4_19 | je .L2_4_19 | ||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | ||||
| salq $3, %rax // rax = rax * 8 ; number of values | |||||
| salq $ 3, %rax // rax = rax * 8 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| negq BI | negq BI | ||||
| @@ -960,16 +960,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| salq $3, %rax // rax = rax * 8 ; number of values | |||||
| salq $ 3, %rax // rax = rax * 8 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| #endif | #endif | ||||
| #if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
| addq $4, KK | |||||
| addq $ 4, KK | |||||
| #endif | #endif | ||||
| addq $8 * SIZE, CO1 # coffset += 8 | |||||
| addq $ 8 * SIZE, CO1 # coffset += 8 | |||||
| decq I # i -- | decq I # i -- | ||||
| jg .L2_4_11 | jg .L2_4_11 | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -982,7 +982,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /******************************************************************************************************************/ | /******************************************************************************************************************/ | ||||
| .L2_2_10: | .L2_2_10: | ||||
| testq $2, M | |||||
| testq $ 2, M | |||||
| jz .L2_2_40 // to next 2 lines of N | jz .L2_2_40 // to next 2 lines of N | ||||
| .L2_2_11: | .L2_2_11: | ||||
| @@ -991,15 +991,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ | (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ | ||||
| (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | ||||
| leaq BUFFER1, BO // first buffer to BO | leaq BUFFER1, BO // first buffer to BO | ||||
| addq $8 * SIZE, BO | |||||
| addq $ 8 * SIZE, BO | |||||
| #else | #else | ||||
| movq KK, %rax | movq KK, %rax | ||||
| leaq BUFFER1, BO // first buffer to BO | leaq BUFFER1, BO // first buffer to BO | ||||
| addq $8 * SIZE, BO | |||||
| addq $ 8 * SIZE, BO | |||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq (,BI,4), BI // BI = BI * 4 ; number of values | leaq (,BI,4), BI // BI = BI * 4 ; number of values | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| salq $2, %rax // rax = rax * 4 ; number of values | |||||
| salq $ 2, %rax // rax = rax * 4 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| #endif | #endif | ||||
| @@ -1014,20 +1014,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| movq KK, %rax | movq KK, %rax | ||||
| #ifdef LEFT | #ifdef LEFT | ||||
| addq $2, %rax // number of values in AO | |||||
| addq $ 2, %rax // number of values in AO | |||||
| #else | #else | ||||
| addq $2, %rax // number of values in BO | |||||
| addq $ 2, %rax // number of values in BO | |||||
| #endif | #endif | ||||
| movq %rax, KKK | movq %rax, KKK | ||||
| #endif | #endif | ||||
| andq $-8, %rax // K = K - ( K % 8 ) | |||||
| andq $ -8, %rax // K = K - ( K % 8 ) | |||||
| je .L2_2_16 | je .L2_2_16 | ||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | ||||
| salq $2, %rax // rax = rax * 4 ; number of values | |||||
| salq $ 2, %rax // rax = rax * 4 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| negq BI | negq BI | ||||
| @@ -1086,13 +1086,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| movq KKK, %rax | movq KKK, %rax | ||||
| #endif | #endif | ||||
| andq $7, %rax # if (k & 7) | |||||
| andq $ 7, %rax # if (k & 7) | |||||
| je .L2_2_19 | je .L2_2_19 | ||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | ||||
| salq $2, %rax // rax = rax * 4 ; number of values | |||||
| salq $ 2, %rax // rax = rax * 4 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| negq BI | negq BI | ||||
| @@ -1118,16 +1118,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| salq $2, %rax // rax = rax * 4 ; number of values | |||||
| salq $ 2, %rax // rax = rax * 4 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| #endif | #endif | ||||
| #if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
| addq $2, KK | |||||
| addq $ 2, KK | |||||
| #endif | #endif | ||||
| addq $4 * SIZE, CO1 # coffset += 4 | |||||
| addq $ 4 * SIZE, CO1 # coffset += 4 | |||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -1135,7 +1135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| * Rest of M | * Rest of M | ||||
| ***************************************************************************/ | ***************************************************************************/ | ||||
| .L2_2_40: | .L2_2_40: | ||||
| testq $1, M | |||||
| testq $ 1, M | |||||
| jz .L2_2_60 // to next 2 lines of N | jz .L2_2_60 // to next 2 lines of N | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -1146,15 +1146,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ | (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ | ||||
| (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | ||||
| leaq BUFFER1, BO // first buffer to BO | leaq BUFFER1, BO // first buffer to BO | ||||
| addq $8 * SIZE, BO | |||||
| addq $ 8 * SIZE, BO | |||||
| #else | #else | ||||
| movq KK, %rax | movq KK, %rax | ||||
| leaq BUFFER1, BO // first buffer to BO | leaq BUFFER1, BO // first buffer to BO | ||||
| addq $8 * SIZE, BO | |||||
| addq $ 8 * SIZE, BO | |||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq (,BI,4), BI // BI = BI * 4 ; number of values | leaq (,BI,4), BI // BI = BI * 4 ; number of values | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| salq $1, %rax // rax = rax * 2 ; number of values | |||||
| salq $ 1, %rax // rax = rax * 2 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| #endif | #endif | ||||
| @@ -1169,20 +1169,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| movq KK, %rax | movq KK, %rax | ||||
| #ifdef LEFT | #ifdef LEFT | ||||
| addq $1, %rax // number of values in AO | |||||
| addq $ 1, %rax // number of values in AO | |||||
| #else | #else | ||||
| addq $2, %rax // number of values in BO | |||||
| addq $ 2, %rax // number of values in BO | |||||
| #endif | #endif | ||||
| movq %rax, KKK | movq %rax, KKK | ||||
| #endif | #endif | ||||
| andq $-8, %rax // K = K - ( K % 8 ) | |||||
| andq $ -8, %rax // K = K - ( K % 8 ) | |||||
| je .L2_2_46 | je .L2_2_46 | ||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | ||||
| salq $1, %rax // rax = rax * 2 ; number of values | |||||
| salq $ 1, %rax // rax = rax * 2 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| negq BI | negq BI | ||||
| @@ -1237,13 +1237,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| movq KKK, %rax | movq KKK, %rax | ||||
| #endif | #endif | ||||
| andq $7, %rax # if (k & 7) | |||||
| andq $ 7, %rax # if (k & 7) | |||||
| je .L2_2_49 | je .L2_2_49 | ||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | ||||
| salq $1, %rax // rax = rax * 2 ; number of values | |||||
| salq $ 1, %rax // rax = rax * 2 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| negq BI | negq BI | ||||
| @@ -1269,16 +1269,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | leaq ( ,BI,4), BI // BI = BI * 4 ; number of values | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| salq $1, %rax // rax = rax * 2 ; number of values | |||||
| salq $ 1, %rax // rax = rax * 2 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| #endif | #endif | ||||
| #if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
| addq $1, KK | |||||
| addq $ 1, KK | |||||
| #endif | #endif | ||||
| addq $2 * SIZE, CO1 # coffset += 2 | |||||
| addq $ 2 * SIZE, CO1 # coffset += 2 | |||||
| decq I # i -- | decq I # i -- | ||||
| jg .L2_2_41 | jg .L2_2_41 | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -1288,7 +1288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .L2_2_60: | .L2_2_60: | ||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | #if defined(TRMMKERNEL) && !defined(LEFT) | ||||
| addq $2, KK | |||||
| addq $ 2, KK | |||||
| #endif | #endif | ||||
| decq J // j -- | decq J // j -- | ||||
| @@ -1303,7 +1303,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *************************************************************************************************/ | *************************************************************************************************/ | ||||
| movq Nmod6, J | movq Nmod6, J | ||||
| andq $1, J // j % 2 | |||||
| andq $ 1, J // j % 2 | |||||
| je .L999 | je .L999 | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -1318,8 +1318,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmovups (BO1), %xmm0 | vmovups (BO1), %xmm0 | ||||
| vmovups %xmm0, (BO) | vmovups %xmm0, (BO) | ||||
| addq $2*SIZE,BO1 | |||||
| addq $2*SIZE,BO | |||||
| addq $ 2*SIZE,BO1 | |||||
| addq $ 2*SIZE,BO | |||||
| decq %rax | decq %rax | ||||
| jnz .L1_00_02b | jnz .L1_00_02b | ||||
| @@ -1337,10 +1337,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| movq A, AO // aoffset = a | movq A, AO // aoffset = a | ||||
| addq $8 * SIZE, AO | |||||
| addq $ 8 * SIZE, AO | |||||
| movq M, I | movq M, I | ||||
| sarq $2, I // i = (m >> 2) | |||||
| sarq $ 2, I // i = (m >> 2) | |||||
| je .L1_2_10 | je .L1_2_10 | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -1354,15 +1354,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ | (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ | ||||
| (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | ||||
| leaq BUFFER1, BO // first buffer to BO | leaq BUFFER1, BO // first buffer to BO | ||||
| addq $4 * SIZE, BO | |||||
| addq $ 4 * SIZE, BO | |||||
| #else | #else | ||||
| movq KK, %rax | movq KK, %rax | ||||
| leaq BUFFER1, BO // first buffer to BO | leaq BUFFER1, BO // first buffer to BO | ||||
| addq $4 * SIZE, BO | |||||
| addq $ 4 * SIZE, BO | |||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq (,BI,2), BI // BI = BI * 2 ; number of values | leaq (,BI,2), BI // BI = BI * 2 ; number of values | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| salq $3, %rax // rax = rax * 8 ; number of values | |||||
| salq $ 3, %rax // rax = rax * 8 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| #endif | #endif | ||||
| @@ -1377,20 +1377,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| movq KK, %rax | movq KK, %rax | ||||
| #ifdef LEFT | #ifdef LEFT | ||||
| addq $4, %rax // number of values in AO | |||||
| addq $ 4, %rax // number of values in AO | |||||
| #else | #else | ||||
| addq $1, %rax // number of values in BO | |||||
| addq $ 1, %rax // number of values in BO | |||||
| #endif | #endif | ||||
| movq %rax, KKK | movq %rax, KKK | ||||
| #endif | #endif | ||||
| andq $-8, %rax // K = K - ( K % 8 ) | |||||
| andq $ -8, %rax // K = K - ( K % 8 ) | |||||
| je .L1_4_16 | je .L1_4_16 | ||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | ||||
| salq $3, %rax // rax = rax * 8 ; number of values | |||||
| salq $ 3, %rax // rax = rax * 8 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| negq BI | negq BI | ||||
| @@ -1433,13 +1433,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| movq KKK, %rax | movq KKK, %rax | ||||
| #endif | #endif | ||||
| andq $7, %rax # if (k & 7) | |||||
| andq $ 7, %rax # if (k & 7) | |||||
| je .L1_4_19 | je .L1_4_19 | ||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | ||||
| salq $3, %rax // rax = rax * 8 ; number of values | |||||
| salq $ 3, %rax // rax = rax * 8 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| negq BI | negq BI | ||||
| @@ -1466,16 +1466,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| salq $3, %rax // rax = rax * 8 ; number of values | |||||
| salq $ 3, %rax // rax = rax * 8 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| #endif | #endif | ||||
| #if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
| addq $4, KK | |||||
| addq $ 4, KK | |||||
| #endif | #endif | ||||
| addq $8 * SIZE, CO1 # coffset += 8 | |||||
| addq $ 8 * SIZE, CO1 # coffset += 8 | |||||
| decq I # i -- | decq I # i -- | ||||
| jg .L1_4_11 | jg .L1_4_11 | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -1485,7 +1485,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /*******************************************************************************************************/ | /*******************************************************************************************************/ | ||||
| .L1_2_10: | .L1_2_10: | ||||
| testq $2, M | |||||
| testq $ 2, M | |||||
| jz .L1_2_40 | jz .L1_2_40 | ||||
| @@ -1495,15 +1495,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ | (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ | ||||
| (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | ||||
| leaq BUFFER1, BO // first buffer to BO | leaq BUFFER1, BO // first buffer to BO | ||||
| addq $4 * SIZE, BO | |||||
| addq $ 4 * SIZE, BO | |||||
| #else | #else | ||||
| movq KK, %rax | movq KK, %rax | ||||
| leaq BUFFER1, BO // first buffer to BO | leaq BUFFER1, BO // first buffer to BO | ||||
| addq $4 * SIZE, BO | |||||
| addq $ 4 * SIZE, BO | |||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq (,BI,2), BI // BI = BI * 2 ; number of values | leaq (,BI,2), BI // BI = BI * 2 ; number of values | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| salq $2, %rax // rax = rax * 4 ; number of values | |||||
| salq $ 2, %rax // rax = rax * 4 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| #endif | #endif | ||||
| @@ -1518,20 +1518,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| movq KK, %rax | movq KK, %rax | ||||
| #ifdef LEFT | #ifdef LEFT | ||||
| addq $2, %rax // number of values in AO | |||||
| addq $ 2, %rax // number of values in AO | |||||
| #else | #else | ||||
| addq $1, %rax // number of values in BO | |||||
| addq $ 1, %rax // number of values in BO | |||||
| #endif | #endif | ||||
| movq %rax, KKK | movq %rax, KKK | ||||
| #endif | #endif | ||||
| andq $-8, %rax // K = K - ( K % 8 ) | |||||
| andq $ -8, %rax // K = K - ( K % 8 ) | |||||
| je .L1_2_16 | je .L1_2_16 | ||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | ||||
| salq $2, %rax // rax = rax * 4 ; number of values | |||||
| salq $ 2, %rax // rax = rax * 4 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| negq BI | negq BI | ||||
| @@ -1583,13 +1583,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| movq KKK, %rax | movq KKK, %rax | ||||
| #endif | #endif | ||||
| andq $7, %rax # if (k & 7) | |||||
| andq $ 7, %rax # if (k & 7) | |||||
| je .L1_2_19 | je .L1_2_19 | ||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | ||||
| salq $2, %rax // rax = rax * 4 ; number of values | |||||
| salq $ 2, %rax // rax = rax * 4 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| negq BI | negq BI | ||||
| @@ -1615,16 +1615,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| salq $2, %rax // rax = rax * 4 ; number of values | |||||
| salq $ 2, %rax // rax = rax * 4 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| #endif | #endif | ||||
| #if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
| addq $2, KK | |||||
| addq $ 2, KK | |||||
| #endif | #endif | ||||
| addq $4 * SIZE, CO1 # coffset += 4 | |||||
| addq $ 4 * SIZE, CO1 # coffset += 4 | |||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -1633,7 +1633,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| * Rest of M | * Rest of M | ||||
| ***************************************************************************/ | ***************************************************************************/ | ||||
| .L1_2_40: | .L1_2_40: | ||||
| testq $1, M | |||||
| testq $ 1, M | |||||
| jz .L999 | jz .L999 | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -1644,15 +1644,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ | (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ | ||||
| (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | ||||
| leaq BUFFER1, BO // first buffer to BO | leaq BUFFER1, BO // first buffer to BO | ||||
| addq $4 * SIZE, BO | |||||
| addq $ 4 * SIZE, BO | |||||
| #else | #else | ||||
| movq KK, %rax | movq KK, %rax | ||||
| leaq BUFFER1, BO // first buffer to BO | leaq BUFFER1, BO // first buffer to BO | ||||
| addq $4 * SIZE, BO | |||||
| addq $ 4 * SIZE, BO | |||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq (,BI,2), BI // BI = BI * 2 ; number of values | leaq (,BI,2), BI // BI = BI * 2 ; number of values | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| salq $1, %rax // rax = rax * 2 ; number of values | |||||
| salq $ 1, %rax // rax = rax * 2 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| #endif | #endif | ||||
| @@ -1667,20 +1667,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| movq KK, %rax | movq KK, %rax | ||||
| #ifdef LEFT | #ifdef LEFT | ||||
| addq $1, %rax // number of values in AO | |||||
| addq $ 1, %rax // number of values in AO | |||||
| #else | #else | ||||
| addq $1, %rax // number of values in BO | |||||
| addq $ 1, %rax // number of values in BO | |||||
| #endif | #endif | ||||
| movq %rax, KKK | movq %rax, KKK | ||||
| #endif | #endif | ||||
| andq $-8, %rax // K = K - ( K % 8 ) | |||||
| andq $ -8, %rax // K = K - ( K % 8 ) | |||||
| je .L1_2_46 | je .L1_2_46 | ||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | ||||
| salq $1, %rax // rax = rax * 2 ; number of values | |||||
| salq $ 1, %rax // rax = rax * 2 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| negq BI | negq BI | ||||
| @@ -1731,13 +1731,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| movq KKK, %rax | movq KKK, %rax | ||||
| #endif | #endif | ||||
| andq $7, %rax # if (k & 7) | |||||
| andq $ 7, %rax # if (k & 7) | |||||
| je .L1_2_49 | je .L1_2_49 | ||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | ||||
| salq $1, %rax // rax = rax * 2 ; number of values | |||||
| salq $ 1, %rax // rax = rax * 2 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| negq BI | negq BI | ||||
| @@ -1763,16 +1763,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| movq %rax, BI // Index for BO | movq %rax, BI // Index for BO | ||||
| leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | leaq ( ,BI,2), BI // BI = BI * 2 ; number of values | ||||
| leaq (BO, BI, SIZE), BO | leaq (BO, BI, SIZE), BO | ||||
| salq $1, %rax // rax = rax * 2 ; number of values | |||||
| salq $ 1, %rax // rax = rax * 2 ; number of values | |||||
| leaq (AO, %rax, SIZE), AO | leaq (AO, %rax, SIZE), AO | ||||
| #endif | #endif | ||||
| #if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
| addq $1, KK | |||||
| addq $ 1, KK | |||||
| #endif | #endif | ||||
| addq $2 * SIZE, CO1 # coffset += 2 | |||||
| addq $ 2 * SIZE, CO1 # coffset += 2 | |||||
| decq I # i -- | decq I # i -- | ||||
| jg .L1_2_41 | jg .L1_2_41 | ||||
| ALIGN_4 | ALIGN_4 | ||||
| @@ -1806,7 +1806,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| movups 208(%rsp), %xmm15 | movups 208(%rsp), %xmm15 | ||||
| #endif | #endif | ||||
| addq $STACKSIZE, %rsp | |||||
| addq $ STACKSIZE, %rsp | |||||
| ret | ret | ||||
| EPILOGUE | EPILOGUE | ||||