It overflowed the internal buffer, so we now split vector x into blocks when m is very large. Thanks to @wangqian for this patch.
tags/v0.2.5
| @@ -47,7 +47,7 @@ | |||
| #ifndef WINDOWS_ABI | |||
| #define STACKSIZE 64 | |||
| #define STACKSIZE 128 | |||
| #define OLD_M %rdi | |||
| #define OLD_N %rsi | |||
| @@ -57,7 +57,10 @@ | |||
| #define STACK_Y 16 + STACKSIZE(%rsp) | |||
| #define STACK_INCY 24 + STACKSIZE(%rsp) | |||
| #define STACK_BUFFER 32 + STACKSIZE(%rsp) | |||
| #define MMM 56(%rsp) | |||
| #define NN 64(%rsp) | |||
| #define AA 72(%rsp) | |||
| #define LDAX 80(%rsp) | |||
| #else | |||
| #define STACKSIZE 256 | |||
| @@ -132,12 +135,44 @@ | |||
| movq OLD_LDA, LDA | |||
| movq OLD_X, X | |||
| #else | |||
| movq OLD_M, M | |||
| movq OLD_N, N | |||
| movq OLD_A, A | |||
| movq OLD_LDA, LDA | |||
| movq OLD_M, MMM | |||
| movq OLD_N, NN | |||
| movq OLD_A, AA | |||
| movq OLD_LDA, LDAX | |||
| #endif | |||
| #ifdef HAVE_SSE3 | |||
| #ifndef WINDOWS_ABI | |||
| movddup %xmm0, ALPHA | |||
| #else | |||
| movddup %xmm3, ALPHA | |||
| #endif | |||
| #else | |||
| #ifndef WINDOWS_ABI | |||
| movapd %xmm0, ALPHA | |||
| #else | |||
| movapd %xmm3, ALPHA | |||
| #endif | |||
| unpcklpd ALPHA, ALPHA | |||
| #endif | |||
| .L0x: | |||
| xorq M,M | |||
| addq $1,M | |||
| salq $22,M | |||
| subq M,MMM | |||
| jge .L00 | |||
| movq MMM,%rax | |||
| addq M,%rax | |||
| jle .L999x | |||
| movq %rax,M | |||
| .L00: | |||
| movq LDAX,LDA | |||
| movq NN,N | |||
| movq AA,A | |||
| movq STACK_INCX, INCX | |||
| movq STACK_Y, Y | |||
| movq STACK_INCY, INCY | |||
| @@ -153,21 +188,6 @@ | |||
| subq $-16 * SIZE, A | |||
| #ifdef HAVE_SSE3 | |||
| #ifndef WINDOWS_ABI | |||
| movddup %xmm0, ALPHA | |||
| #else | |||
| movddup %xmm3, ALPHA | |||
| #endif | |||
| #else | |||
| #ifndef WINDOWS_ABI | |||
| movapd %xmm0, ALPHA | |||
| #else | |||
| movapd %xmm3, ALPHA | |||
| #endif | |||
| unpcklpd ALPHA, ALPHA | |||
| #endif | |||
| testq M, M | |||
| jle .L999 | |||
| testq N, N | |||
| @@ -854,7 +874,6 @@ | |||
| .L21: | |||
| #endif | |||
| subq $4, N | |||
| leaq 16 * SIZE(BUFFER), X1 | |||
| @@ -2461,6 +2480,12 @@ | |||
| ALIGN_4 | |||
| .L999: | |||
| leaq (, M, SIZE), %rax | |||
| addq %rax,AA | |||
| jmp .L0x; | |||
| ALIGN_4 | |||
| .L999x: | |||
| movq 0(%rsp), %rbx | |||
| movq 8(%rsp), %rbp | |||
| movq 16(%rsp), %r12 | |||