[WIP] Make NaN handling in the SCAL kernels depend on the dummy2 parameter (tags/v0.3.28^2)
| @@ -43,9 +43,22 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( (n <= 0) || (inc_x <= 0)) | |||
| return(0); | |||
| if (dummy2 == 0) { | |||
| while(j < n) | |||
| { | |||
| while(j < n) | |||
| { | |||
| if ( da == 0.0 ) | |||
| x[i]=0.0; | |||
| else | |||
| x[i] = da * x[i] ; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } else { | |||
| while(j < n) | |||
| { | |||
| if ( da == 0.0 ) | |||
| if (!isnan(x[i]) && !isinf(x[i])) { | |||
| @@ -59,6 +72,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| return 0; | |||
| @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define X_COPY x5 /* X vector address */ | |||
| #define INC_X x4 /* X stride */ | |||
| #define I x1 /* loop variable */ | |||
| #define FLAG x9 | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| @@ -168,9 +168,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, xzr | |||
| ble .Lscal_kernel_L999 | |||
| //fcmp DA, #0.0 | |||
| //beq .Lscal_kernel_zero | |||
| ldr FLAG, [sp] | |||
| cmp FLAG, #1 | |||
| beq .Lscal_kernel_nansafe | |||
| fcmp DA, #0.0 | |||
| beq .Lscal_kernel_zero | |||
| .Lscal_kernel_nansafe: | |||
| cmp INC_X, #1 | |||
| bne .Lscal_kernel_S_BEGIN | |||
| @@ -73,6 +73,15 @@ static void dscal_kernel_8_zero (BLASLONG n, FLOAT *x) | |||
| for( i=0; i<n; i+=8 ) | |||
| { | |||
| x[0] = alpha; | |||
| x[1] = alpha; | |||
| x[2] = alpha; | |||
| x[3] = alpha; | |||
| x[4] = alpha; | |||
| x[5] = alpha; | |||
| x[6] = alpha; | |||
| x[7] = alpha; | |||
| #if 0 | |||
| if(isfinite(x[0])) | |||
| x[0] = alpha; | |||
| else | |||
| @@ -106,7 +115,8 @@ static void dscal_kernel_8_zero (BLASLONG n, FLOAT *x) | |||
| else | |||
| x[7] = NAN; | |||
| x+=8; | |||
| } | |||
| #endif | |||
| } | |||
| } | |||
| @@ -130,6 +140,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| if (dummy2 == 0) | |||
| for (j = 0; j < align; j++) { | |||
| x [j] = 0.0; | |||
| } | |||
| else | |||
| for (j = 0; j < align; j++) { | |||
| if (isfinite(x[j])) | |||
| x[j] = 0.0; | |||
| @@ -151,7 +166,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| j=n1; | |||
| } | |||
| #endif | |||
| if (dummy2 == 0) | |||
| while(j < n) | |||
| { | |||
| x[j]=0.0; | |||
| j++; | |||
| } | |||
| else | |||
| while(j < n) | |||
| { | |||
| if (!isfinite(x[j])) | |||
| @@ -202,7 +223,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( da == 0.0 ) | |||
| { | |||
| if (dummy2 == 0) | |||
| while(j < n) | |||
| { | |||
| x[i]=0.0; | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| else | |||
| while(j < n) | |||
| { | |||
| if (!isfinite(x[i])) | |||
| @@ -47,9 +47,11 @@ | |||
| #ifndef __64BIT__ | |||
| #define X r6 | |||
| #define INCX r7 | |||
| #define FLAG r11 | |||
| #else | |||
| #define X r7 | |||
| #define INCX r8 | |||
| #define FLAG r12 | |||
| #endif | |||
| #endif | |||
| @@ -57,9 +59,11 @@ | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| #define X r8 | |||
| #define INCX r9 | |||
| #define FLAG r13 | |||
| #else | |||
| #define X r7 | |||
| #define INCX r8 | |||
| #define FLAG r12 | |||
| #endif | |||
| #endif | |||
| @@ -84,9 +88,12 @@ | |||
| cmpwi cr0, N, 0 | |||
| blelr- cr0 | |||
| // fcmpu cr0, FZERO, ALPHA | |||
| // bne- cr0, LL(A1I1) | |||
| b LL(A1I1) | |||
| fcmpu cr0, FZERO, ALPHA | |||
| bne- cr0, LL(A1I1) | |||
| ld FLAG, 48+64+8(SP) | |||
| cmpwi cr0, FLAG, 1 | |||
| beq- cr0, LL(A1I1) | |||
| cmpwi cr0, INCX, SIZE | |||
| bne- cr0, LL(A0IN) | |||
| @@ -74,7 +74,24 @@ static void sscal_kernel_16_zero( BLASLONG n, FLOAT *x ) | |||
| for( i=0; i<n; i+=8 ) | |||
| { | |||
| if (isfinite(x[0])) | |||
| x[0] = alpha; | |||
| x[1] = alpha; | |||
| x[2] = alpha; | |||
| x[3] = alpha; | |||
| x[4] = alpha; | |||
| x[5] = alpha; | |||
| x[6] = alpha; | |||
| x[7] = alpha; | |||
| x[8] = alpha; | |||
| x[9] = alpha; | |||
| x[10] = alpha; | |||
| x[11] = alpha; | |||
| x[12] = alpha; | |||
| x[13] = alpha; | |||
| x[14] = alpha; | |||
| x[15] = alpha; | |||
| #if 0 | |||
| if (isfinite(x[0])) | |||
| x[0] = alpha; | |||
| else | |||
| x[0] = NAN; | |||
| @@ -107,7 +124,8 @@ static void sscal_kernel_16_zero( BLASLONG n, FLOAT *x ) | |||
| else | |||
| x[7] = NAN; | |||
| x+=8; | |||
| } | |||
| #endif | |||
| } | |||
| } | |||
| @@ -132,6 +150,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( n >= 32 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| if (dummy2 == 0) | |||
| for (j = 0; j < align; j++){ | |||
| x[j] = 0.0; | |||
| } | |||
| else | |||
| for (j = 0; j < align; j++) { | |||
| if (isfinite(x[j])) | |||
| x[j] = 0.0; | |||
| @@ -153,9 +176,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| j=n1; | |||
| } | |||
| #endif | |||
| if (dummy2 == 0) | |||
| while(j < n) | |||
| { | |||
| x[j] = 0.0; | |||
| j++; | |||
| } | |||
| else | |||
| while(j < n) | |||
| { | |||
| if (isfinite(x[j])) | |||
| x[j]=0.0; | |||
| else | |||
| @@ -204,7 +233,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( da == 0.0 ) | |||
| { | |||
| if (dummy2 == 0) | |||
| while(j < n) | |||
| { | |||
| x[i]=0.0; | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| else | |||
| while(j < n) | |||
| { | |||
| if (isfinite(x[i])) | |||
| @@ -43,9 +43,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( (n <= 0) || (inc_x <= 0)) | |||
| return(0); | |||
| while(j < n) | |||
| { | |||
| if (dummy2 == 0) { | |||
| while(j < n) | |||
| { | |||
| if ( da == 0.0 ) | |||
| if (isfinite(x[i])) | |||
| @@ -57,7 +57,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } else { | |||
| while(j < n) | |||
| { | |||
| if ( da == 0.0 ) | |||
| x[i]=0.0; | |||
| else | |||
| x[i] = da * x[i] ; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| return 0; | |||
| @@ -56,7 +56,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| FLOAT_V_T v0; | |||
| if(inc_x == 1) { | |||
| if(da == 0.0) { | |||
| if(dummy2 == 0 && da == 0.0) { | |||
| int gvl = VSETVL_MAX; | |||
| v0 = VFMVVF_FLOAT(0.0, gvl); | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| @@ -75,7 +75,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| if(da == 0.0) { | |||
| if(dummy2 == 0 && da == 0.0) { | |||
| int gvl = VSETVL_MAX; | |||
| v0 = VFMVVF_FLOAT(0.0, gvl); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| @@ -71,7 +71,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| FLOAT_V_T v0, v1; | |||
| unsigned int gvl = 0; | |||
| if(inc_x == 1){ | |||
| if (0){ //if(da == 0.0){ | |||
| if(dummy2 == 0 && da == 0.0){ | |||
| memset(&x[0], 0, n * sizeof(FLOAT)); | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| @@ -96,7 +96,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } | |||
| } | |||
| }else{ | |||
| if (0) { //if(da == 0.0){ | |||
| if(dummy2 == 0 && da == 0.0){ | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG ix = 0; | |||
| gvl = VSETVL(n); | |||
| @@ -57,19 +57,24 @@ | |||
| /* Load the stack arguments into %edi and %esi (presumably X and INCX - confirm against the full file) | |||
| and the dummy2 flag into %ecx; the flag is compared against 1 further down. | |||
| In every precision variant the flag lives 16 bytes above the %esi argument, | |||
| so the single-precision offset is 36+16 = 52 (54 would be a misaligned read spanning two slots). */ | |||
| #ifdef XDOUBLE | |||
| movl 44(%esp),%edi | |||
| movl 48(%esp),%esi | |||
| movl 64(%esp),%ecx | |||
| #elif defined(DOUBLE) | |||
| movl 36(%esp),%edi | |||
| movl 40(%esp),%esi | |||
| movl 56(%esp),%ecx | |||
| #else | |||
| movl 32(%esp),%edi | |||
| movl 36(%esp),%esi | |||
| movl 52(%esp),%ecx | |||
| #endif | |||
| ftst | |||
| fnstsw %ax | |||
| andb $68, %ah | |||
| // je .L300 # Alpha != ZERO | |||
| jmp .L300 | |||
| je .L300 # Alpha != ZERO | |||
| cmpl $1,%ecx # dummy2 flag | |||
| je .L300 | |||
| /* Alpha == ZERO */ | |||
| cmpl $1,%esi | |||
| @@ -60,8 +60,10 @@ | |||
| #ifdef WINDOWS_ABI | |||
| movq 40(%rsp), X | |||
| movq 48(%rsp), INCX | |||
| movq 64(%rsp), %r9 | |||
| movaps %xmm3, %xmm0 | |||
| #else | |||
| movq 24(%rsp), %r9 | |||
| #endif | |||
| SAVEREGISTERS | |||
| @@ -73,6 +75,10 @@ | |||
| lea (, INCX, SIZE), INCX | |||
| comisd %xmm0, %xmm1 | |||
| jne .L100 | |||
| jp .L100 | |||
| cmpq $1, %r9 | |||
| je .L100 | |||
| /* Alpha == ZERO */ | |||
| cmpq $SIZE, INCX | |||
| @@ -60,8 +60,10 @@ | |||
| #ifdef WINDOWS_ABI | |||
| movq 40(%rsp), X | |||
| movq 48(%rsp), INCX | |||
| movq 64(%rsp), %r9 | |||
| movaps %xmm3, %xmm0 | |||
| #else | |||
| movq 24(%rsp), %r9 | |||
| #endif | |||
| SAVEREGISTERS | |||
| @@ -76,6 +78,8 @@ | |||
| shufps $0, %xmm0, %xmm0 | |||
| jne .L100 # Alpha != ZERO | |||
| cmpq $1, %r9 | |||
| je .L100 | |||
| /* Alpha == ZERO */ | |||
| cmpq $SIZE, INCX | |||
| @@ -48,6 +48,7 @@ | |||
| #define X ARG2 | |||
| #define INCX ARG3 | |||
| #endif | |||
| #define FLAG %r9 | |||
| #define XX %r10 | |||
| #define I %rax | |||
| @@ -60,8 +61,10 @@ | |||
| #ifdef WINDOWS_ABI | |||
| movq 40(%rsp), X | |||
| movq 48(%rsp), INCX | |||
| movq 64(%rsp), FLAG | |||
| movaps %xmm3, %xmm0 | |||
| #else | |||
| movq 24(%rsp), FLAG | |||
| #endif | |||
| SAVEREGISTERS | |||
| @@ -75,6 +78,8 @@ | |||
| comisd %xmm0, %xmm1 | |||
| jne .L100 # Alpha != ZERO | |||
| jp .L100 # For Alpha = NaN | |||
| cmpq $1, FLAG | |||
| je .L100 # disable the Alpha=zero path as it does not handle x=inf or nan | |||
| /* Alpha == ZERO */ | |||
| cmpq $SIZE, INCX | |||
| @@ -74,7 +74,7 @@ | |||
| pxor %xmm15, %xmm15 | |||
| comisd %xmm0, %xmm15 | |||
| jne .L30 # Alpha_r != ZERO | |||
| jp .L30 | |||
| comisd %xmm1, %xmm15 | |||
| jne .L30 # Alpha_i != ZERO | |||
| @@ -76,7 +76,7 @@ | |||
| pxor %xmm15, %xmm15 | |||
| comiss %xmm0, %xmm15 | |||
| jne .L100 # Alpha_r != ZERO | |||
| jp .L100 # Alpha_r == NAN | |||
| comiss %xmm1, %xmm15 | |||
| jne .L100 # Alpha_i != ZERO | |||