| @@ -279,30 +279,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmulpd %ymm0 , %ymm9 , %ymm9 /* ymm9..ymm15 *= ymm0, element-wise packed doubles (ymm0 is presumably the broadcast alpha; confirm against the start of this macro) */ | |||
| vmulpd %ymm0 , %ymm10, %ymm10 | |||
| vmulpd %ymm0 , %ymm11, %ymm11 | |||
#if B_PR1 >= 96 | |||
| prefetcht0 128 + BUFFER1 /* first of seven B_PR1-gated prefetches of BUFFER1; each later one raises both the threshold and the offset by 64 */ | |||
| #endif | |||
| vmulpd %ymm0 , %ymm12, %ymm12 | |||
| vmulpd %ymm0 , %ymm13, %ymm13 | |||
| vmulpd %ymm0 , %ymm14, %ymm14 | |||
| vmulpd %ymm0 , %ymm15, %ymm15 /* last use of ymm0 as the multiplier; ymm0..ymm3 are reused as temporaries below */ | |||
| #if B_PR1 >= 160 | |||
| prefetcht0 192 + BUFFER1 | |||
| #endif | |||
| vpermilpd $ 0x05 , %ymm5, %ymm5 /* swap the two doubles inside each 128-bit lane of ymm5 */ | |||
| vpermilpd $ 0x05 , %ymm7, %ymm7 /* same swap for ymm7 */ | |||
| #if B_PR1 >= 224 | |||
| prefetcht0 256 + BUFFER1 | |||
| #endif | |||
| vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 /* ymm0 = elements 0,2 of ymm4 and elements 1,3 of ymm5 */ | |||
| vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 /* ymm1 = elements 0,2 of ymm5 and elements 1,3 of ymm4 */ | |||
| vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 /* ymm2 = elements 0,2 of ymm6 and elements 1,3 of ymm7 */ | |||
| vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 /* ymm3 = elements 0,2 of ymm7 and elements 1,3 of ymm6 */ | |||
| #if B_PR1 >= 288 | |||
| prefetcht0 320 + BUFFER1 | |||
| #endif | |||
| vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 /* exchange the low and high 128-bit halves of ymm2 */ | |||
| vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 /* exchange the low and high 128-bit halves of ymm3 */ | |||
| #if B_PR1 >= 352 | |||
| prefetcht0 384 + BUFFER1 | |||
| #endif | |||
| vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 /* ymm4 = low half of ymm0, high half of ymm2 */ | |||
| vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 /* ymm5 = low half of ymm1, high half of ymm3 */ | |||
| vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 /* ymm6 = low half of ymm2, high half of ymm0 */ | |||
| vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 /* ymm7 = low half of ymm3, high half of ymm1; ymm4..ymm7 now hold the rearranged tile (NOTE(review): presumably the layout expected by the stores that follow; confirm) */ | |||
| #if B_PR1 >= 416 | |||
| prefetcht0 448 + BUFFER1 | |||
| #endif | |||
| leaq (CO1, LDC, 2), %rax /* rax = CO1 + 2 * LDC */ | |||
| #if B_PR1 >= 480 | |||
| prefetcht0 512 + BUFFER1 | |||
| #endif | |||
| #if !defined(TRMMKERNEL) | |||
| @@ -1867,13 +1882,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* here for the prefetch of next b source block */ | |||
| /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ | |||
| /* currently B advances by 128 bytes per pass here (64 bytes under WINDOWS_ABI) */ | |||
| salq $3, K /* temporarily K *= 8, so that (B, K, 8) below addresses B + 64 * K(original) bytes */ | |||
| #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ | |||
| prefetcht2 32(B) | |||
| prefetcht2 32(B, K, 8) | |||
| addq $64, B /* increment */ | |||
| #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ | |||
| prefetcht2 32(B) | |||
| prefetcht2 32(B, K, 8) | |||
| prefetcht2 96(B) | |||
| prefetcht2 96(B, K, 8) | |||
| addq $128, B /* increment */ | |||
| #endif | |||
| sarq $3, K /* restore K to its value before the salq above */ | |||
| decq I # i -- | |||
| @@ -1883,10 +1904,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /************************************************************************** | |||
| * Rest of M | |||
| ***************************************************************************/ | |||
| /* recover the original value of pointer B */ | |||
| /* recover the original value of pointer B after prefetch: B -= (M >> 2) * step, where step is 64 bytes under WINDOWS_ABI and 128 bytes otherwise */ | |||
| movq M, I /* I is used as scratch here */ | |||
| sarq $2, I /* I = M >> 2 (presumably the number of passes that each advanced B; confirm against the loop above) */ | |||
| #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ | |||
| salq $6, I /* I *= 64, matching addq $64, B */ | |||
| #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ | |||
| salq $7, I /* I *= 128, matching addq $128, B */ | |||
| #endif | |||
| subq I, B /* B -= total bytes added during prefetch */ | |||
| .L12_20: | |||
| @@ -2166,13 +2192,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* here for the prefetch of next b source block */ | |||
| /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ | |||
| /* currently B advances by 128 bytes per pass here (64 bytes under WINDOWS_ABI) */ | |||
| salq $3, K /* temporarily K *= 8, so that (B, K, 8) below addresses B + 64 * K(original) bytes */ | |||
| #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ | |||
| prefetcht2 (B) | |||
| prefetcht2 (B, K, 8) | |||
| addq $64, B /* increment */ | |||
| #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ | |||
| prefetcht2 (B) | |||
| prefetcht2 (B, K, 8) | |||
| prefetcht2 64(B) | |||
| prefetcht2 64(B, K, 8) | |||
| addq $128, B /* increment */ | |||
| #endif | |||
| sarq $3, K /* restore K to its value before the salq above */ | |||
| decq I # i -- | |||
| @@ -2185,7 +2217,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* recover the original value of pointer B after prefetch: B -= (M >> 2) * step, where step is 64 bytes under WINDOWS_ABI and 128 bytes otherwise */ | |||
| movq M, I /* I is used as scratch here */ | |||
| sarq $2, I /* I = M >> 2 (presumably the number of passes that each advanced B; confirm against the loop above) */ | |||
| #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ | |||
| salq $6, I /* I *= 64, matching addq $64, B */ | |||
| #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ | |||
| salq $7, I /* I *= 128, matching addq $128, B */ | |||
| #endif | |||
| subq I, B /* B -= total bytes added during prefetch */ | |||
| .L13_20: | |||