| @@ -80,6 +80,24 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT | |||
| ( | |||
| "xorps %%xmm10 , %%xmm10 \n\t" | |||
| "xorps %%xmm11 , %%xmm11 \n\t" | |||
| "testq $4 , %1 \n\t" | |||
| "jz .L01LABEL%= \n\t" | |||
| "movups (%5,%0,4) , %%xmm14 \n\t" // x | |||
| "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 | |||
| "movups (%4,%0,4) , %%xmm13 \n\t" // ap1 | |||
| "mulps %%xmm14 , %%xmm12 \n\t" | |||
| "mulps %%xmm14 , %%xmm13 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addps %%xmm12 , %%xmm10 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "addps %%xmm13 , %%xmm11 \n\t" | |||
| ".L01LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| @@ -89,13 +107,23 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT | |||
| "movups (%4,%0,4) , %%xmm13 \n\t" // ap1 | |||
| "mulps %%xmm14 , %%xmm12 \n\t" | |||
| "mulps %%xmm14 , %%xmm13 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addps %%xmm12 , %%xmm10 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "addps %%xmm13 , %%xmm11 \n\t" | |||
| "movups 16(%5,%0,4) , %%xmm14 \n\t" // x | |||
| "movups 16(%3,%0,4) , %%xmm12 \n\t" // ap0 | |||
| "movups 16(%4,%0,4) , %%xmm13 \n\t" // ap1 | |||
| "mulps %%xmm14 , %%xmm12 \n\t" | |||
| "mulps %%xmm14 , %%xmm13 \n\t" | |||
| "addps %%xmm12 , %%xmm10 \n\t" | |||
| "addps %%xmm13 , %%xmm11 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L01END%=: \n\t" | |||
| "haddps %%xmm10, %%xmm10 \n\t" | |||
| "haddps %%xmm11, %%xmm11 \n\t" | |||
| "haddps %%xmm10, %%xmm10 \n\t" | |||
| @@ -113,7 +141,8 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT | |||
| "r" (ap1), // 4 | |||
| "r" (x) // 5 | |||
| : "cc", | |||
| "%xmm10", "%xmm11", "%xmm12", | |||
| "%xmm4", "%xmm5", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| @@ -130,10 +159,11 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorps %%xmm9 , %%xmm9 \n\t" | |||
| "xorps %%xmm10 , %%xmm10 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "testq $4 , %1 \n\t" | |||
| "jz .L01LABEL%= \n\t" | |||
| "movups (%3,%0,4) , %%xmm12 \n\t" | |||
| "movups (%4,%0,4) , %%xmm11 \n\t" | |||
| @@ -142,8 +172,30 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| "addps %%xmm12 , %%xmm10 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L01LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%3,%0,4) , %%xmm12 \n\t" | |||
| "movups 16(%3,%0,4) , %%xmm14 \n\t" | |||
| "movups (%4,%0,4) , %%xmm11 \n\t" | |||
| "movups 16(%4,%0,4) , %%xmm13 \n\t" | |||
| "mulps %%xmm11 , %%xmm12 \n\t" | |||
| "mulps %%xmm13 , %%xmm14 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "addps %%xmm12 , %%xmm10 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "addps %%xmm14 , %%xmm9 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L01END%=: \n\t" | |||
| "addps %%xmm9 , %%xmm10 \n\t" | |||
| "haddps %%xmm10, %%xmm10 \n\t" | |||
| "haddps %%xmm10, %%xmm10 \n\t" | |||
| @@ -157,7 +209,8 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| "r" (ap), // 3 | |||
| "r" (x) // 4 | |||
| : "cc", | |||
| "%xmm10", "%xmm11", "%xmm12", | |||
| "%xmm9", "%xmm10" , | |||
| "%xmm11", "%xmm12", "%xmm13", "%xmm14", | |||
| "memory" | |||
| ); | |||