| @@ -520,6 +520,19 @@ LL(1000): | |||
| .align 4 | |||
| LL(1010): | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f24, f24 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f25, f25 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f26, f26 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f27, f27 | |||
| bun cr0, LL(9999) | |||
| fabs f8, f24 | |||
| fabs f9, f25 | |||
| fabs f10, f26 | |||
| @@ -529,6 +542,20 @@ LL(1010): | |||
| LFD f25, 9 * SIZE(XX) | |||
| LFD f26, 10 * SIZE(XX) | |||
| LFD f27, 11 * SIZE(XX) | |||
| // NaN-check the second half of the current block (f28-f31). f25-f27 (and | |||
| // presumably f24) were just reloaded with data for the next iteration, so | |||
| // they must not be tested against the current block's indices here. | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f28, f28 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f29, f29 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f30, f30 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f31, f31 | |||
| bun cr0, LL(9999) | |||
| // Undo the eight increments made by the NaN checks above. | |||
| subi RET, RET, 8 | |||
| fabs f12, f28 | |||
| fabs f13, f29 | |||
| @@ -577,6 +604,32 @@ LL(1010): | |||
| .align 4 | |||
| LL(1020): | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f24, f24 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f25, f25 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f26, f26 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f27, f27 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f28, f28 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f29, f29 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f30, f30 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f31, f31 | |||
| bun cr0, LL(9999) | |||
| subi RET, RET, 8 | |||
| fabs f8, f24 | |||
| fabs f9, f25 | |||
| fabs f10, f26 | |||
| @@ -631,8 +684,12 @@ LL(1050): | |||
| LL(1060): | |||
| LFD f8, 0 * SIZE(XX) | |||
| addi XX, XX, 1 * SIZE | |||
| // Advance RET exactly once per element, before either branch to LL(9999). | |||
| addi RET, RET, 1 | |||
| // A NaN compares unordered with itself: leave the loop at its position. | |||
| fcmpu cr0, f8, f8 | |||
| bun cr0, LL(9999) | |||
| fabs f8, f8 | |||
| // Stop when |x| equals f1 (presumably the maximum found earlier - confirm). | |||
| fcmpu cr0, f1, f8 | |||
| beq cr0, LL(9999) | |||
| bdnz LL(1060) | |||
| @@ -658,6 +715,18 @@ LL(1100): | |||
| .align 4 | |||
| LL(1110): | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f24, f24 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f25, f25 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f26, f26 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f27, f27 | |||
| bun cr0, LL(9999) | |||
| fabs f8, f24 | |||
| fabs f9, f25 | |||
| fabs f10, f26 | |||
| @@ -667,7 +736,19 @@ LL(1110): | |||
| LFDUX f25, XX, INCX | |||
| LFDUX f26, XX, INCX | |||
| LFDUX f27, XX, INCX | |||
| // NaN-check the second half of the current block (f28-f31). f25-f27 (and | |||
| // presumably f24) were just reloaded via LFDUX for the next iteration, so | |||
| // they must not be tested against the current block's indices here. | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f28, f28 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f29, f29 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f30, f30 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f31, f31 | |||
| bun cr0, LL(9999) | |||
| // Undo the eight increments made by the NaN checks above. | |||
| subi RET, RET, 8 | |||
| fabs f12, f28 | |||
| fabs f13, f29 | |||
| fabs f14, f30 | |||
| @@ -714,6 +795,30 @@ LL(1110): | |||
| .align 4 | |||
| LL(1120): | |||
| // NaN-check all eight values of the block (f24-f31); RET is advanced before | |||
| // each test so it already counts the element when branching to LL(9999). | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f24, f24 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f25, f25 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f26, f26 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f27, f27 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f28, f28 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f29, f29 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f30, f30 | |||
| bun cr0, LL(9999) | |||
| addi RET, RET, 1 | |||
| fcmpu cr0, f31, f31 | |||
| bun cr0, LL(9999) | |||
| // No NaN found: undo the eight increments made above. | |||
| subi RET, RET, 8 | |||
| fabs f8, f24 | |||
| fabs f9, f25 | |||
| fabs f10, f26 | |||
| @@ -765,8 +870,11 @@ LL(1150): | |||
| LL(1160): | |||
| LFDUX f8, XX, INCX | |||
| // Advance RET exactly once per element, before either branch to LL(9999). | |||
| addi RET, RET, 1 | |||
| // A NaN compares unordered with itself: leave the loop at its position. | |||
| fcmpu cr0, f8, f8 | |||
| bun cr0, LL(9999) | |||
| fabs f8, f8 | |||
| // Stop when |x| equals f1 (presumably the maximum found earlier - confirm). | |||
| fcmpu cr0, f1, f8 | |||
| beq cr0, LL(9999) | |||
| bdnz LL(1160) | |||
| @@ -327,6 +327,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG max = 0; | |||
| if (n <= 0 || inc_x <= 0) return (max); | |||
| if (n == 1) return(1); | |||
| if (inc_x == 1) { | |||
| @@ -335,7 +336,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| /* Scan every element the vector kernel is about to consume (n1 of them), not just the first 32, so a NaN anywhere in that range is reported. */ | |||
| for (BLASLONG ii = i; ii < n1; ii++) if (x[ii] != x[ii]) return(ii+1); | |||
| max = diamax_kernel_32(n1, x, &maxf); | |||
| i = n1; | |||
| @@ -343,6 +344,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| #endif | |||
| #endif | |||
| while (i < n) { | |||
| if (x[i] != x[i]) return(i+1); | |||
| if (ABS(x[i]) > maxf) { | |||
| max = i; | |||
| maxf = ABS(x[i]); | |||
| @@ -356,6 +358,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG n1 = n & -4; | |||
| while (j < n1) { | |||
| /* i is the strided offset into x, j the 0-based element index; a NaN at element k is reported as k+1. */ | |||
| if (x[i] != x[i]) return(j+1); | |||
| if (x[i+inc_x] != x[i+inc_x]) return(j+2); | |||
| if (x[i+2*inc_x] != x[i+2*inc_x]) return(j+3); | |||
| if (x[i+3*inc_x] != x[i+3*inc_x]) return(j+4); | |||
| if (ABS(x[i]) > maxf) { | |||
| max = j; | |||
| maxf = ABS(x[i]); | |||
| @@ -381,6 +387,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| while (j < n) { | |||
| if (x[i] != x[i]) return(j+1); | |||
| if (ABS(x[i]) > maxf) { | |||
| max = j; | |||
| maxf = ABS(x[i]); | |||
| @@ -58,6 +58,78 @@ static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
| register __vector float quadruple_values={0,0,0,0}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| for(; i<n; i+=64){ | |||
| if (vec_any_nan(v_ptrx[0])) { | |||
| float d=vec_extract(v_ptrx[0],0); | |||
| if (d!=d) return(i+0); | |||
| d=vec_extract(v_ptrx[0],1); | |||
| if (d!=d) return(i+1); | |||
| d=vec_extract(v_ptrx[0],2); | |||
| if (d!=d) return(i+2); | |||
| return(i+3); | |||
| } | |||
| if (vec_any_nan(v_ptrx[1])) { | |||
| float d=vec_extract(v_ptrx[1],0); | |||
| if (d!=d) return(i+4+0); | |||
| d=vec_extract(v_ptrx[1],1); | |||
| if (d!=d) return(i+4+1); | |||
| d=vec_extract(v_ptrx[1],2); | |||
| if (d!=d) return(i+4+2); | |||
| return(i+4+3); | |||
| } | |||
| if (vec_any_nan(v_ptrx[2])) { | |||
| float d=vec_extract(v_ptrx[2],0); | |||
| if (d!=d) return(i+8+0); | |||
| d=vec_extract(v_ptrx[2],1); | |||
| if (d!=d) return(i+8+1); | |||
| d=vec_extract(v_ptrx[2],2); | |||
| if (d!=d) return(i+8+2); | |||
| return(i+8+3); | |||
| } | |||
| if (vec_any_nan(v_ptrx[3])) { | |||
| float d=vec_extract(v_ptrx[3],0); | |||
| if (d!=d) return(i+12+0); | |||
| d=vec_extract(v_ptrx[3],1); | |||
| if (d!=d) return(i+12+1); | |||
| d=vec_extract(v_ptrx[3],2); | |||
| if (d!=d) return(i+12+2); | |||
| return(i+12+3); | |||
| } | |||
| if (vec_any_nan(v_ptrx[4])) { | |||
| float d=vec_extract(v_ptrx[4],0); | |||
| if (d!=d) return(i+16+0); | |||
| d=vec_extract(v_ptrx[4],1); | |||
| if (d!=d) return(i+16+1); | |||
| d=vec_extract(v_ptrx[4],2); | |||
| if (d!=d) return(i+16+2); | |||
| return(i+16+3); | |||
| } | |||
| if (vec_any_nan(v_ptrx[5])) { | |||
| float d=vec_extract(v_ptrx[5],0); | |||
| if (d!=d) return(i+20+0); | |||
| d=vec_extract(v_ptrx[5],1); | |||
| if (d!=d) return(i+20+1); | |||
| d=vec_extract(v_ptrx[5],2); | |||
| if (d!=d) return(i+20+2); | |||
| return(i+20+3); | |||
| } | |||
| if (vec_any_nan(v_ptrx[6])) { | |||
| float d=vec_extract(v_ptrx[6],0); | |||
| if (d!=d) return(i+24+0); | |||
| d=vec_extract(v_ptrx[6],1); | |||
| if (d!=d) return(i+24+1); | |||
| d=vec_extract(v_ptrx[6],2); | |||
| if (d!=d) return(i+24+2); | |||
| return(i+24+3); | |||
| } | |||
| if (vec_any_nan(v_ptrx[7])) { | |||
| float d=vec_extract(v_ptrx[7],0); | |||
| if (d!=d) return(i+28+0); | |||
| d=vec_extract(v_ptrx[7],1); | |||
| if (d!=d) return(i+28+1); | |||
| d=vec_extract(v_ptrx[7],2); | |||
| if (d!=d) return(i+28+2); | |||
| return(i+28+3); | |||
| } | |||
| //absolute temporary vectors | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| @@ -226,7 +298,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG max = 0; | |||
| if (n <= 0 || inc_x <= 0) return (max); | |||
| if (x[0] != x[0]) return(1); | |||
| if (inc_x == 1) { | |||
| BLASLONG n1 = n & -64; | |||
| @@ -238,6 +310,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| } | |||
| while (i < n) { | |||
| if (x[i] != x[i]) return(i+1); | |||
| if (ABS(x[i]) > maxf) { | |||
| max = i; | |||
| maxf = ABS(x[i]); | |||
| @@ -251,18 +324,22 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG n1 = n & -4; | |||
| while (j < n1) { | |||
| /* j is the 0-based element index of x[i]; a NaN at element k is reported as k+1. */ | |||
| if (x[i] != x[i]) return(j+1); | |||
| if (ABS(x[i]) > maxf) { | |||
| max = j; | |||
| maxf = ABS(x[i]); | |||
| } | |||
| if (x[i+inc_x] != x[i+inc_x]) return(j+2); | |||
| if (ABS(x[i + inc_x]) > maxf) { | |||
| max = j + 1; | |||
| maxf = ABS(x[i + inc_x]); | |||
| } | |||
| if (x[i+2*inc_x] != x[i+2*inc_x]) return(j+3); | |||
| if (ABS(x[i + 2 * inc_x]) > maxf) { | |||
| max = j + 2; | |||
| maxf = ABS(x[i + 2 * inc_x]); | |||
| } | |||
| if (x[i+3*inc_x] != x[i+3*inc_x]) return(j+4); | |||
| if (ABS(x[i + 3 * inc_x]) > maxf) { | |||
| max = j + 3; | |||
| maxf = ABS(x[i + 3 * inc_x]); | |||
| @@ -276,6 +353,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| while (j < n) { | |||
| if (x[i] != x[i]) return(j+1); | |||
| if (ABS(x[i]) > maxf) { | |||
| max = j; | |||
| maxf = ABS(x[i]); | |||