Add data prefetch in DOT and ASUM functions (tags/v0.2.20^2)
| @@ -36,40 +36,51 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| BLASLONG i, inc_x2; | |||
| FLOAT sumf = 0.0; | |||
| v4f32 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3; | |||
| v4f32 zero_v = {0}; | |||
| v4f32 src8, src9, src10, src11, src12, src13, src14, src15; | |||
| v4f32 sum_abs0 = {0, 0, 0, 0}; | |||
| v4f32 sum_abs1 = {0, 0, 0, 0}; | |||
| v4f32 sum_abs2 = {0, 0, 0, 0}; | |||
| v4f32 sum_abs3 = {0, 0, 0, 0}; | |||
| v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; | |||
| if (n <= 0 || inc_x <= 0) return (sumf); | |||
| if (1 == inc_x) | |||
| { | |||
| if (n > 15) | |||
| { | |||
| n -= 16; | |||
| LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| #ifdef ENABLE_PREFETCH | |||
| FLOAT *x_pref; | |||
| BLASLONG pref_offset; | |||
| sum_abs0 = AND_VEC_W(src0); | |||
| sum_abs1 = AND_VEC_W(src1); | |||
| sum_abs2 = AND_VEC_W(src2); | |||
| sum_abs3 = AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs2 += AND_VEC_W(src6); | |||
| sum_abs3 += AND_VEC_W(src7); | |||
| } | |||
| else | |||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| sum_abs0 = zero_v; | |||
| sum_abs1 = zero_v; | |||
| sum_abs2 = zero_v; | |||
| sum_abs3 = zero_v; | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| } | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| x_pref = x + pref_offset + 128; | |||
| #endif | |||
| for (i = (n >> 4); i--;) | |||
| for (i = (n >> 5); i--;) | |||
| { | |||
| #ifdef ENABLE_PREFETCH | |||
| __asm__ __volatile__( | |||
| "pref 0, 0(%[x_pref])\n\t" | |||
| "pref 0, 32(%[x_pref])\n\t" | |||
| "pref 0, 64(%[x_pref])\n\t" | |||
| "pref 0, 96(%[x_pref])\n\t" | |||
| "pref 0, 128(%[x_pref])\n\t" | |||
| "pref 0, 160(%[x_pref])\n\t" | |||
| "pref 0, 192(%[x_pref])\n\t" | |||
| "pref 0, 224(%[x_pref])\n\t" | |||
| : : [x_pref] "r" (x_pref) | |||
| ); | |||
| x_pref += 64; | |||
| #endif | |||
| LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| @@ -79,13 +90,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs2 += AND_VEC_W(src6); | |||
| sum_abs3 += AND_VEC_W(src7); | |||
| sum_abs0 += AND_VEC_W(src8); | |||
| sum_abs1 += AND_VEC_W(src9); | |||
| sum_abs2 += AND_VEC_W(src10); | |||
| sum_abs3 += AND_VEC_W(src11); | |||
| sum_abs0 += AND_VEC_W(src12); | |||
| sum_abs1 += AND_VEC_W(src13); | |||
| sum_abs2 += AND_VEC_W(src14); | |||
| sum_abs3 += AND_VEC_W(src15); | |||
| } | |||
| if (n & 15) | |||
| if (n & 31) | |||
| { | |||
| if ((n & 8) && (n & 4) && (n & 2)) | |||
| if (n & 16) | |||
| { | |||
| LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6); | |||
| LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| @@ -94,65 +113,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs2 += AND_VEC_W(src6); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| sum_abs3 += AND_VEC_W(src7); | |||
| } | |||
| else if ((n & 8) && (n & 4)) | |||
| { | |||
| LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if ((n & 8) && (n & 2)) | |||
| { | |||
| LD_SP5_INC(x, 4, src0, src1, src2, src3, src4); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if ((n & 4) && (n & 2)) | |||
| { | |||
| LD_SP3_INC(x, 4, src0, src1, src2); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if (n & 8) | |||
| if (n & 8) | |||
| { | |||
| LD_SP4_INC(x, 4, src0, src1, src2, src3); | |||
| @@ -160,97 +124,45 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if (n & 4) | |||
| if (n & 4) | |||
| { | |||
| LD_SP2_INC(x, 4, src0, src1); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if (n & 2) | |||
| if (n & 2) | |||
| { | |||
| src0 = LD_SP(x); x += 4; | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else | |||
| { | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| if (n & 1) | |||
| { | |||
| sumf += fabsf(*(x + 0)); | |||
| sumf += fabsf(*x); | |||
| sumf += fabsf(*(x + 1)); | |||
| } | |||
| } | |||
| else | |||
| { | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf += sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else | |||
| { | |||
| inc_x2 = 2 * inc_x; | |||
| if (n > 8) | |||
| { | |||
| n -= 8; | |||
| LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 = AND_VEC_W(src0); | |||
| sum_abs1 = AND_VEC_W(src1); | |||
| sum_abs2 = AND_VEC_W(src2); | |||
| sum_abs3 = AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs2 += AND_VEC_W(src6); | |||
| sum_abs3 += AND_VEC_W(src7); | |||
| } | |||
| else | |||
| { | |||
| sum_abs0 = zero_v; | |||
| sum_abs1 = zero_v; | |||
| sum_abs2 = zero_v; | |||
| sum_abs3 = zero_v; | |||
| } | |||
| for (i = (n >> 3); i--;) | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| LD_SP8_INC(x, inc_x2, src8, src9, src10, src11, src12, src13, src14, src15); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| @@ -260,13 +172,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs2 += AND_VEC_W(src6); | |||
| sum_abs3 += AND_VEC_W(src7); | |||
| sum_abs0 += AND_VEC_W(src8); | |||
| sum_abs1 += AND_VEC_W(src9); | |||
| sum_abs2 += AND_VEC_W(src10); | |||
| sum_abs3 += AND_VEC_W(src11); | |||
| sum_abs0 += AND_VEC_W(src12); | |||
| sum_abs1 += AND_VEC_W(src13); | |||
| sum_abs2 += AND_VEC_W(src14); | |||
| sum_abs3 += AND_VEC_W(src15); | |||
| } | |||
| if (n & 7) | |||
| if (n & 15) | |||
| { | |||
| if ((n & 4) && (n & 2) && (n & 1)) | |||
| if (n & 8) | |||
| { | |||
| LD_SP7_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6); | |||
| LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| @@ -275,37 +195,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs2 += AND_VEC_W(src6); | |||
| sum_abs3 += AND_VEC_W(src7); | |||
| } | |||
| else if ((n & 4) && (n & 2)) | |||
| { | |||
| LD_SP6_INC(x, inc_x2, src0, src1, src2, src3, src4, src5); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| } | |||
| else if ((n & 4) && (n & 1)) | |||
| { | |||
| LD_SP5_INC(x, inc_x2, src0, src1, src2, src3, src4); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| } | |||
| else if ((n & 2) && (n & 1)) | |||
| { | |||
| LD_SP3_INC(x, inc_x2, src0, src1, src2); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| } | |||
| else if (n & 4) | |||
| if (n & 4) | |||
| { | |||
| LD_SP4_INC(x, inc_x2, src0, src1, src2, src3); | |||
| @@ -314,22 +207,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| } | |||
| else if (n & 2) | |||
| if (n & 2) | |||
| { | |||
| LD_SP2_INC(x, inc_x2, src0, src1); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| } | |||
| else if (n & 1) | |||
| if (n & 1) | |||
| { | |||
| src0 = LD_SP(x); x += inc_x2; | |||
| src0 = LD_SP(x); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| } | |||
| } | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0] + sum_abs0[1]; | |||
| } | |||
| @@ -29,333 +29,274 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "macros_msa.h" | |||
| #if !defined(CONJ) | |||
| #define OP2 += | |||
| #define OP3 - | |||
| #define OP4 + | |||
| #define OP1 -= | |||
| #define OP2 += | |||
| #define OP3 - | |||
| #define OP4 + | |||
| #else | |||
| #define OP2 -= | |||
| #define OP3 + | |||
| #define OP4 - | |||
| #define OP1 += | |||
| #define OP2 -= | |||
| #define OP3 + | |||
| #define OP4 - | |||
| #endif | |||
| #define DOT16_KERNEL(OPR0, OPR1) \ | |||
| dot0 += (vx0r * vy0r); \ | |||
| dot0 OPR0## = (vx0i * vy0i); \ | |||
| dot1 OPR1## = (vx0i * vy0r); \ | |||
| dot1 += (vx0r * vy0i); \ | |||
| \ | |||
| dot0 += (vx1r * vy1r); \ | |||
| dot0 OPR0## = (vx1i * vy1i); \ | |||
| dot1 OPR1## = (vx1i * vy1r); \ | |||
| dot1 += (vx1r * vy1i); \ | |||
| \ | |||
| dot0 += (vx2r * vy2r); \ | |||
| dot0 OPR0## = (vx2i * vy2i); \ | |||
| dot1 OPR1## = (vx2i * vy2r); \ | |||
| dot1 += (vx2r * vy2i); \ | |||
| \ | |||
| dot0 += (vx3r * vy3r); \ | |||
| dot0 OPR0## = (vx3i * vy3i); \ | |||
| dot1 OPR1## = (vx3i * vy3r); \ | |||
| dot1 += (vx3r * vy3i); | |||
| #define DOT12_KERNEL(OPR0, OPR1) \ | |||
| dot0 += (vx0r * vy0r); \ | |||
| dot0 OPR0## = (vx0i * vy0i); \ | |||
| dot1 OPR1## = (vx0i * vy0r); \ | |||
| dot1 += (vx0r * vy0i); \ | |||
| \ | |||
| dot0 += (vx1r * vy1r); \ | |||
| dot0 OPR0## = (vx1i * vy1i); \ | |||
| dot1 OPR1## = (vx1i * vy1r); \ | |||
| dot1 += (vx1r * vy1i); \ | |||
| \ | |||
| dot0 += (vx2r * vy2r); \ | |||
| dot0 OPR0## = (vx2i * vy2i); \ | |||
| dot1 OPR1## = (vx2i * vy2r); \ | |||
| dot1 += (vx2r * vy2i); | |||
| #define DOT8_KERNEL(OPR0, OPR1) \ | |||
| dot0 += (vx0r * vy0r); \ | |||
| dot0 OPR0## = (vx0i * vy0i); \ | |||
| dot1 OPR1## = (vx0i * vy0r); \ | |||
| dot1 += (vx0r * vy0i); \ | |||
| \ | |||
| dot0 += (vx1r * vy1r); \ | |||
| dot0 OPR0## = (vx1i * vy1i); \ | |||
| dot1 OPR1## = (vx1i * vy1r); \ | |||
| dot1 += (vx1r * vy1i); | |||
| #define DOT4_KERNEL(OPR0, OPR1) \ | |||
| dot0 += (vx0r * vy0r); \ | |||
| dot0 OPR0## = (vx0i * vy0i); \ | |||
| dot1 OPR1## = (vx0i * vy0r); \ | |||
| dot1 += (vx0r * vy0i); | |||
| /* return float, x,y float */ | |||
| /* cdotc - CONJ */ | |||
| /* cdotu - !CONJ */ | |||
| #ifndef _MSC_VER | |||
| #include <complex.h> | |||
| FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #else | |||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #endif | |||
| { | |||
| BLASLONG i = 0; | |||
| FLOAT dot[2]; | |||
| BLASLONG inc_x2; | |||
| BLASLONG inc_y2; | |||
| BLASLONG inc_x2, inc_y2; | |||
| FLOAT x0, x1, x2, x3, x4, x5, x6, x7; | |||
| FLOAT y0, y1, y2, y3, y4, y5, y6, y7; | |||
| v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; | |||
| v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; | |||
| v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i; | |||
| v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i; | |||
| v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i; | |||
| v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i; | |||
| v4f32 dot0 = {0, 0, 0, 0}; | |||
| v4f32 dot1 = {0, 0, 0, 0}; | |||
| openblas_complex_float result; | |||
| v4f32 dot2 = {0, 0, 0, 0}; | |||
| v4f32 dot3 = {0, 0, 0, 0}; | |||
| v4f32 dot4 = {0, 0, 0, 0}; | |||
| v4f32 dot5 = {0, 0, 0, 0}; | |||
| v4f32 dot6 = {0, 0, 0, 0}; | |||
| v4f32 dot7 = {0, 0, 0, 0}; | |||
| OPENBLAS_COMPLEX_FLOAT result; | |||
| dot[0] = 0.0; | |||
| dot[1] = 0.0; | |||
| __real__(result) = 0.0; | |||
| __imag__(result) = 0.0; | |||
| CREAL(result) = 0.0; | |||
| CIMAG(result) = 0.0; | |||
| if ( n < 1 ) return(result); | |||
| if (n < 1) return (result); | |||
| if ((1 == inc_x) && (1 == inc_y)) | |||
| { | |||
| #ifdef ENABLE_PREFETCH | |||
| FLOAT *x_pref, *y_pref; | |||
| BLASLONG pref_offset; | |||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| } | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| x_pref = x + pref_offset + 64; | |||
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| } | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| y_pref = y + pref_offset + 64; | |||
| #endif | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); | |||
| LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); | |||
| PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); | |||
| PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i); | |||
| PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i); | |||
| PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); | |||
| PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); | |||
| PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i); | |||
| PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i); | |||
| #if !defined(CONJ) | |||
| DOT16_KERNEL(-, +); | |||
| #else | |||
| DOT16_KERNEL(+, -); | |||
| #endif | |||
| #ifdef ENABLE_PREFETCH | |||
| __asm__ __volatile__( | |||
| "pref 0, 0(%[x_pref])\n\t" | |||
| "pref 0, 32(%[x_pref])\n\t" | |||
| "pref 0, 64(%[x_pref])\n\t" | |||
| "pref 0, 96(%[x_pref])\n\t" | |||
| "pref 0, 0(%[y_pref])\n\t" | |||
| "pref 0, 32(%[y_pref])\n\t" | |||
| "pref 0, 64(%[y_pref])\n\t" | |||
| "pref 0, 96(%[y_pref])\n\t" | |||
| : : [x_pref] "r" (x_pref), [y_pref] "r" (y_pref) | |||
| ); | |||
| x_pref += 32; | |||
| y_pref += 32; | |||
| #endif | |||
| LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); | |||
| LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); | |||
| PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); | |||
| PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i); | |||
| PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i); | |||
| PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); | |||
| PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); | |||
| PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i); | |||
| PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i); | |||
| dot0 += (vx0r * vy0r); | |||
| dot0 OP1 (vx0i * vy0i); | |||
| dot1 OP2 (vx0i * vy0r); | |||
| dot1 += (vx0r * vy0i); | |||
| dot2 += (vx1r * vy1r); | |||
| dot2 OP1 (vx1i * vy1i); | |||
| dot3 OP2 (vx1i * vy1r); | |||
| dot3 += (vx1r * vy1i); | |||
| dot4 += (vx2r * vy2r); | |||
| dot4 OP1 (vx2i * vy2i); | |||
| dot5 OP2 (vx2i * vy2r); | |||
| dot5 += (vx2r * vy2i); | |||
| dot6 += (vx3r * vy3r); | |||
| dot6 OP1 (vx3i * vy3i); | |||
| dot7 OP2 (vx3i * vy3r); | |||
| dot7 += (vx3r * vy3i); | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if ((n & 8) && (n & 4)) | |||
| if (n & 8) | |||
| { | |||
| LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); | |||
| LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); | |||
| LD_SP2_INC(x, 4, vx4, vx5); | |||
| LD_SP2_INC(y, 4, vy4, vy5); | |||
| PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); | |||
| PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i); | |||
| PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); | |||
| PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); | |||
| PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i); | |||
| #if !defined(CONJ) | |||
| DOT12_KERNEL(-, +); | |||
| #else | |||
| DOT12_KERNEL(+, -); | |||
| #endif | |||
| LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); | |||
| LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); | |||
| PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); | |||
| PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); | |||
| PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); | |||
| dot0 += (vx0r * vy0r); | |||
| dot0 OP1 (vx0i * vy0i); | |||
| dot1 OP2 (vx0i * vy0r); | |||
| dot1 += (vx0r * vy0i); | |||
| dot2 += (vx1r * vy1r); | |||
| dot2 OP1 (vx1i * vy1i); | |||
| dot3 OP2 (vx1i * vy1r); | |||
| dot3 += (vx1r * vy1i); | |||
| } | |||
| else if (n & 8) | |||
| if (n & 4) | |||
| { | |||
| LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); | |||
| LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); | |||
| LD_SP2_INC(x, 4, vx0, vx1); | |||
| LD_SP2_INC(y, 4, vy0, vy1); | |||
| PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); | |||
| dot0 += (vx0r * vy0r); | |||
| dot0 OP1 (vx0i * vy0i); | |||
| dot1 OP2 (vx0i * vy0r); | |||
| dot1 += (vx0r * vy0i); | |||
| } | |||
| PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i); | |||
| if (n & 2) | |||
| { | |||
| LD_GP4_INC(x, 1, x0, x1, x2, x3); | |||
| LD_GP4_INC(y, 1, y0, y1, y2, y3); | |||
| PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); | |||
| PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i); | |||
| dot[0] += (x0 * y0 OP3 x1 * y1); | |||
| dot[1] OP2 (x1 * y0 OP4 x0 * y1); | |||
| #if !defined(CONJ) | |||
| DOT8_KERNEL(-, +); | |||
| #else | |||
| DOT8_KERNEL(+, -); | |||
| #endif | |||
| dot[0] += (x2 * y2 OP3 x3 * y3); | |||
| dot[1] OP2 (x3 * y2 OP4 x2 * y3); | |||
| } | |||
| else if (n & 4) | |||
| if (n & 1) | |||
| { | |||
| LD_SP2_INC(x, 4, vx0, vx1); | |||
| LD_SP2_INC(y, 4, vy0, vy1); | |||
| PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i); | |||
| #if !defined(CONJ) | |||
| DOT4_KERNEL(-, +); | |||
| #else | |||
| DOT4_KERNEL(+, -); | |||
| #endif | |||
| LD_GP2_INC(x, 1, x0, x1); | |||
| LD_GP2_INC(y, 1, y0, y1); | |||
| dot[0] += (x0 * y0 OP3 x1 * y1); | |||
| dot[1] OP2 (x1 * y0 OP4 x0 * y1); | |||
| } | |||
| } | |||
| if ((n & 2) && (n & 1)) | |||
| { | |||
| LD_GP6_INC(x, 1, x0, x1, x2, x3, x4, x5); | |||
| LD_GP6_INC(y, 1, y0, y1, y2, y3, y4, y5); | |||
| dot0 += dot2 + dot4 + dot6; | |||
| dot1 += dot3 + dot5 + dot7; | |||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||
| dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]); | |||
| dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]); | |||
| } | |||
| else | |||
| { | |||
| inc_x2 = 2 * inc_x; | |||
| inc_y2 = 2 * inc_y; | |||
| dot[0] += ( x2 * y2 OP3 x3 * y3 ); | |||
| dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); | |||
| for (i = (n >> 2); i--;) | |||
| { | |||
| x0 = *x; | |||
| x1 = *(x + 1); | |||
| x += inc_x2; | |||
| x2 = *x; | |||
| x3 = *(x + 1); | |||
| x += inc_x2; | |||
| x4 = *x; | |||
| x5 = *(x + 1); | |||
| x += inc_x2; | |||
| x6 = *x; | |||
| x7 = *(x + 1); | |||
| x += inc_x2; | |||
| y0 = *y; | |||
| y1 = *(y + 1); | |||
| y += inc_y2; | |||
| y2 = *y; | |||
| y3 = *(y + 1); | |||
| y += inc_y2; | |||
| y4 = *y; | |||
| y5 = *(y + 1); | |||
| y += inc_y2; | |||
| y6 = *y; | |||
| y7 = *(y + 1); | |||
| y += inc_y2; | |||
| dot[0] += (x0 * y0 OP3 x1 * y1); | |||
| dot[1] OP2 (x1 * y0 OP4 x0 * y1); | |||
| dot[0] += (x2 * y2 OP3 x3 * y3); | |||
| dot[1] OP2 (x3 * y2 OP4 x2 * y3); | |||
| dot[0] += (x4 * y4 OP3 x5 * y5); | |||
| dot[1] OP2 (x5 * y4 OP4 x4 * y5); | |||
| dot[0] += (x6 * y6 OP3 x7 * y7); | |||
| dot[1] OP2 (x7 * y6 OP4 x6 * y7); | |||
| } | |||
| dot[0] += ( x4 * y4 OP3 x5 * y5 ); | |||
| dot[1] OP2 ( x5 * y4 OP4 x4 * y5 ); | |||
| } | |||
| else if (n & 2) | |||
| { | |||
| LD_GP4_INC(x, 1, x0, x1, x2, x3); | |||
| LD_GP4_INC(y, 1, y0, y1, y2, y3); | |||
| if (n & 2) | |||
| { | |||
| x0 = *x; | |||
| x1 = *(x + 1); | |||
| x += inc_x2; | |||
| x2 = *x; | |||
| x3 = *(x + 1); | |||
| x += inc_x2; | |||
| y0 = *y; | |||
| y1 = *(y + 1); | |||
| y += inc_y2; | |||
| y2 = *y; | |||
| y3 = *(y + 1); | |||
| y += inc_y2; | |||
| dot[0] += (x0 * y0 OP3 x1 * y1); | |||
| dot[1] OP2 (x1 * y0 OP4 x0 * y1); | |||
| dot[0] += (x2 * y2 OP3 x3 * y3); | |||
| dot[1] OP2 (x3 * y2 OP4 x2 * y3); | |||
| } | |||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||
| if (n & 1) | |||
| { | |||
| x0 = *x; | |||
| x1 = *(x + 1); | |||
| x += inc_x2; | |||
| dot[0] += ( x2 * y2 OP3 x3 * y3 ); | |||
| dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); | |||
| } | |||
| else if (n & 1) | |||
| { | |||
| LD_GP2_INC(x, 1, x0, x1); | |||
| LD_GP2_INC(y, 1, y0, y1); | |||
| y0 = *y; | |||
| y1 = *(y + 1); | |||
| y += inc_y2; | |||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||
| } | |||
| dot[0] += (x0 * y0 OP3 x1 * y1); | |||
| dot[1] OP2 (x1 * y0 OP4 x0 * y1); | |||
| } | |||
| } | |||
| CREAL(result) = dot[0]; | |||
| CIMAG(result) = dot[1]; | |||
| dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]); | |||
| dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]); | |||
| } | |||
| else | |||
| { | |||
| inc_x2 = 2 * inc_x; | |||
| inc_y2 = 2 * inc_y; | |||
| for (i = (n >> 2); i--;) | |||
| { | |||
| x0 = *x; | |||
| x1 = *(x + 1); | |||
| x += inc_x2; | |||
| x2 = *x; | |||
| x3 = *(x + 1); | |||
| x += inc_x2; | |||
| x4 = *x; | |||
| x5 = *(x + 1); | |||
| x += inc_x2; | |||
| x6 = *x; | |||
| x7 = *(x + 1); | |||
| x += inc_x2; | |||
| y0 = *y; | |||
| y1 = *(y + 1); | |||
| y += inc_y2; | |||
| y2 = *y; | |||
| y3 = *(y + 1); | |||
| y += inc_y2; | |||
| y4 = *y; | |||
| y5 = *(y + 1); | |||
| y += inc_y2; | |||
| y6 = *y; | |||
| y7 = *(y + 1); | |||
| y += inc_y2; | |||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||
| dot[0] += ( x2 * y2 OP3 x3 * y3 ); | |||
| dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); | |||
| dot[0] += ( x4 * y4 OP3 x5 * y5 ); | |||
| dot[1] OP2 ( x5 * y4 OP4 x4 * y5 ); | |||
| dot[0] += ( x6 * y6 OP3 x7 * y7 ); | |||
| dot[1] OP2 ( x7 * y6 OP4 x6 * y7 ); | |||
| } | |||
| if ((n & 2) && (n & 1)) | |||
| { | |||
| x0 = *x; | |||
| x1 = *(x + 1); | |||
| x += inc_x2; | |||
| x2 = *x; | |||
| x3 = *(x + 1); | |||
| x += inc_x2; | |||
| x4 = *x; | |||
| x5 = *(x + 1); | |||
| x += inc_x2; | |||
| y0 = *y; | |||
| y1 = *(y + 1); | |||
| y += inc_y2; | |||
| y2 = *y; | |||
| y3 = *(y + 1); | |||
| y += inc_y2; | |||
| y4 = *y; | |||
| y5 = *(y + 1); | |||
| y += inc_y2; | |||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||
| dot[0] += ( x2 * y2 OP3 x3 * y3 ); | |||
| dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); | |||
| dot[0] += ( x4 * y4 OP3 x5 * y5 ); | |||
| dot[1] OP2 ( x5 * y4 OP4 x4 * y5 ); | |||
| } | |||
| else if (n & 2) | |||
| { | |||
| x0 = *x; | |||
| x1 = *(x + 1); | |||
| x += inc_x2; | |||
| x2 = *x; | |||
| x3 = *(x + 1); | |||
| x += inc_x2; | |||
| y0 = *y; | |||
| y1 = *(y + 1); | |||
| y += inc_y2; | |||
| y2 = *y; | |||
| y3 = *(y + 1); | |||
| y += inc_y2; | |||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||
| dot[0] += ( x2 * y2 OP3 x3 * y3 ); | |||
| dot[1] OP2 ( x3 * y2 OP4 x2 * y3 ); | |||
| } | |||
| else if (n & 1) | |||
| { | |||
| x0 = *x; | |||
| x1 = *(x + 1); | |||
| x += inc_x2; | |||
| y0 = *y; | |||
| y1 = *(y + 1); | |||
| y += inc_y2; | |||
| dot[0] += ( x0 * y0 OP3 x1 * y1 ); | |||
| dot[1] OP2 ( x1 * y0 OP4 x0 * y1 ); | |||
| } | |||
| } | |||
| __real__(result) = dot[0]; | |||
| __imag__(result) = dot[1]; | |||
| return(result); | |||
| return (result); | |||
| } | |||
| @@ -36,40 +36,51 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| BLASLONG i; | |||
| FLOAT sumf = 0.0; | |||
| v2f64 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3; | |||
| v2f64 zero_v = {0}; | |||
| v2f64 src8, src9, src10, src11, src12, src13, src14, src15; | |||
| v2f64 sum_abs0 = {0, 0}; | |||
| v2f64 sum_abs1 = {0, 0}; | |||
| v2f64 sum_abs2 = {0, 0}; | |||
| v2f64 sum_abs3 = {0, 0}; | |||
| v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF}; | |||
| if (n <= 0 || inc_x <= 0) return (sumf); | |||
| if (1 == inc_x) | |||
| { | |||
| if (n > 15) | |||
| { | |||
| n -= 16; | |||
| LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| #ifdef ENABLE_PREFETCH | |||
| FLOAT *x_pref; | |||
| BLASLONG pref_offset; | |||
| sum_abs0 = AND_VEC_D(src0); | |||
| sum_abs1 = AND_VEC_D(src1); | |||
| sum_abs2 = AND_VEC_D(src2); | |||
| sum_abs3 = AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| sum_abs3 += AND_VEC_D(src7); | |||
| } | |||
| else | |||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| sum_abs0 = zero_v; | |||
| sum_abs1 = zero_v; | |||
| sum_abs2 = zero_v; | |||
| sum_abs3 = zero_v; | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| } | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| x_pref = x + pref_offset + 64; | |||
| #endif | |||
| for (i = (n >> 4); i--;) | |||
| for (i = (n >> 5); i--;) | |||
| { | |||
| #ifdef ENABLE_PREFETCH | |||
| __asm__ __volatile__( | |||
| "pref 0, 0(%[x_pref])\n\t" | |||
| "pref 0, 32(%[x_pref])\n\t" | |||
| "pref 0, 64(%[x_pref])\n\t" | |||
| "pref 0, 96(%[x_pref])\n\t" | |||
| "pref 0, 128(%[x_pref])\n\t" | |||
| "pref 0, 160(%[x_pref])\n\t" | |||
| "pref 0, 192(%[x_pref])\n\t" | |||
| "pref 0, 224(%[x_pref])\n\t" | |||
| : : [x_pref] "r" (x_pref) | |||
| ); | |||
| x_pref += 32; | |||
| #endif | |||
| LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| @@ -79,13 +90,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| sum_abs3 += AND_VEC_D(src7); | |||
| sum_abs0 += AND_VEC_D(src8); | |||
| sum_abs1 += AND_VEC_D(src9); | |||
| sum_abs2 += AND_VEC_D(src10); | |||
| sum_abs3 += AND_VEC_D(src11); | |||
| sum_abs0 += AND_VEC_D(src12); | |||
| sum_abs1 += AND_VEC_D(src13); | |||
| sum_abs2 += AND_VEC_D(src14); | |||
| sum_abs3 += AND_VEC_D(src15); | |||
| } | |||
| if (n & 15) | |||
| if (n & 31) | |||
| { | |||
| if ((n & 8) && (n & 4) && (n & 2)) | |||
| if (n & 16) | |||
| { | |||
| LD_DP7_INC(x, 2, src0, src1, src2, src3, src4, src5, src6); | |||
| LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| @@ -94,37 +113,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| sum_abs3 += AND_VEC_D(src7); | |||
| } | |||
| else if ((n & 8) && (n & 4)) | |||
| { | |||
| LD_DP6_INC(x, 2, src0, src1, src2, src3, src4, src5); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| } | |||
| else if ((n & 8) && (n & 2)) | |||
| { | |||
| LD_DP5_INC(x, 2, src0, src1, src2, src3, src4); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| } | |||
| else if ((n & 4) && (n & 2)) | |||
| { | |||
| LD_DP3_INC(x, 2, src0, src1, src2); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| } | |||
| else if (n & 8) | |||
| if (n & 8) | |||
| { | |||
| LD_DP4_INC(x, 2, src0, src1, src2, src3); | |||
| @@ -133,64 +125,38 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| } | |||
| else if (n & 4) | |||
| if (n & 4) | |||
| { | |||
| LD_DP2_INC(x, 2, src0, src1); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| } | |||
| else if (n & 2) | |||
| if (n & 2) | |||
| { | |||
| src0 = LD_DP(x); x += 2; | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| } | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0] + sum_abs0[1]; | |||
| if (n & 1) | |||
| { | |||
| sumf += fabs(*x); | |||
| } | |||
| } | |||
| else | |||
| { | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0] + sum_abs0[1]; | |||
| } | |||
| sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf += sum_abs0[0] + sum_abs0[1]; | |||
| } | |||
| else | |||
| { | |||
| if (n > 8) | |||
| { | |||
| n -= 8; | |||
| LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 = AND_VEC_D(src0); | |||
| sum_abs1 = AND_VEC_D(src1); | |||
| sum_abs2 = AND_VEC_D(src2); | |||
| sum_abs3 = AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| sum_abs3 += AND_VEC_D(src7); | |||
| } | |||
| else | |||
| { | |||
| sum_abs0 = zero_v; | |||
| sum_abs1 = zero_v; | |||
| sum_abs2 = zero_v; | |||
| sum_abs3 = zero_v; | |||
| } | |||
| for (i = (n >> 3); i--;) | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| @@ -200,13 +166,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| sum_abs3 += AND_VEC_D(src7); | |||
| sum_abs0 += AND_VEC_D(src8); | |||
| sum_abs1 += AND_VEC_D(src9); | |||
| sum_abs2 += AND_VEC_D(src10); | |||
| sum_abs3 += AND_VEC_D(src11); | |||
| sum_abs0 += AND_VEC_D(src12); | |||
| sum_abs1 += AND_VEC_D(src13); | |||
| sum_abs2 += AND_VEC_D(src14); | |||
| sum_abs3 += AND_VEC_D(src15); | |||
| } | |||
| if (n & 7) | |||
| if (n & 15) | |||
| { | |||
| if ((n & 4) && (n & 2) && (n & 1)) | |||
| if (n & 8) | |||
| { | |||
| LD_DP7_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6); | |||
| LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| @@ -215,37 +189,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| sum_abs3 += AND_VEC_D(src7); | |||
| } | |||
| else if ((n & 4) && (n & 2)) | |||
| { | |||
| LD_DP6_INC(x, inc_x, src0, src1, src2, src3, src4, src5); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| } | |||
| else if ((n & 4) && (n & 1)) | |||
| { | |||
| LD_DP5_INC(x, inc_x, src0, src1, src2, src3, src4); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| } | |||
| else if ((n & 2) && (n & 1)) | |||
| { | |||
| LD_DP3_INC(x, inc_x, src0, src1, src2); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| } | |||
| else if (n & 4) | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(x, inc_x, src0, src1, src2, src3); | |||
| @@ -254,14 +201,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| } | |||
| else if (n & 2) | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(x, inc_x, src0, src1); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| } | |||
| else if (n & 1) | |||
| if (n & 1) | |||
| { | |||
| src0 = LD_DP(x); | |||
| @@ -269,7 +218,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| } | |||
| } | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0]; | |||
| } | |||
| @@ -28,105 +28,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| /* return float, x,y float */ | |||
| #if defined(DSDOT) | |||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #else | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #endif | |||
| { | |||
| BLASLONG i = 0; | |||
| double dot = 0.0; | |||
| FLOAT dot = 0.0; | |||
| FLOAT x0, x1, x2, x3, y0, y1, y2, y3; | |||
| v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; | |||
| v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; | |||
| v2f64 dot0 = {0, 0}; | |||
| v2f64 dot1 = {0, 0}; | |||
| v2f64 dot2 = {0, 0}; | |||
| v2f64 dot3 = {0, 0}; | |||
| if (n < 0) return (dot); | |||
| if (n < 1) return (dot); | |||
| if ((1 == inc_x) && (1 == inc_y)) | |||
| { | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); | |||
| LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); | |||
| LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); | |||
| LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); | |||
| #ifdef ENABLE_PREFETCH | |||
| __asm__ __volatile__( | |||
| "pref 0, 256(%[x])\n\t" | |||
| "pref 0, 288(%[x])\n\t" | |||
| "pref 0, 320(%[x])\n\t" | |||
| "pref 0, 352(%[x])\n\t" | |||
| "pref 0, 256(%[y])\n\t" | |||
| "pref 0, 288(%[y])\n\t" | |||
| "pref 0, 320(%[y])\n\t" | |||
| "pref 0, 352(%[y])\n\t" | |||
| : : [x] "r" (x), [y] "r" (y) | |||
| ); | |||
| #endif | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| dot0 += (vy3 * vx3); | |||
| dot1 += (vy1 * vx1); | |||
| dot2 += (vy2 * vx2); | |||
| dot3 += (vy3 * vx3); | |||
| dot0 += (vy4 * vx4); | |||
| dot0 += (vy5 * vx5); | |||
| dot0 += (vy6 * vx6); | |||
| dot0 += (vy7 * vx7); | |||
| dot1 += (vy5 * vx5); | |||
| dot2 += (vy6 * vx6); | |||
| dot3 += (vy7 * vx7); | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if ((n & 8) && (n & 4) && (n & 2)) | |||
| { | |||
| LD_DP7_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6); | |||
| LD_DP7_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| dot0 += (vy3 * vx3); | |||
| dot0 += (vy4 * vx4); | |||
| dot0 += (vy5 * vx5); | |||
| dot0 += (vy6 * vx6); | |||
| } | |||
| else if ((n & 8) && (n & 4)) | |||
| { | |||
| LD_DP6_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5); | |||
| LD_DP6_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| dot0 += (vy3 * vx3); | |||
| dot0 += (vy4 * vx4); | |||
| dot0 += (vy5 * vx5); | |||
| } | |||
| else if ((n & 8) && (n & 2)) | |||
| { | |||
| LD_DP5_INC(x, 2, vx0, vx1, vx2, vx3, vx4); | |||
| LD_DP5_INC(y, 2, vy0, vy1, vy2, vy3, vy4); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| dot0 += (vy3 * vx3); | |||
| dot0 += (vy4 * vx4); | |||
| } | |||
| else if ((n & 4) && (n & 2)) | |||
| { | |||
| LD_DP3_INC(x, 2, vx0, vx1, vx2); | |||
| LD_DP3_INC(y, 2, vy0, vy1, vy2); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| } | |||
| else if (n & 8) | |||
| if (n & 8) | |||
| { | |||
| LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3); | |||
| LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3); | |||
| LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3); | |||
| LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| dot0 += (vy3 * vx3); | |||
| dot1 += (vy1 * vx1); | |||
| dot2 += (vy2 * vx2); | |||
| dot3 += (vy3 * vx3); | |||
| } | |||
| else if (n & 4) | |||
| if (n & 4) | |||
| { | |||
| LD_DP2_INC(x, 2, vx0, vx1); | |||
| LD_DP2_INC(y, 2, vy0, vy1); | |||
| LD_DP2_INC(x, 2, vx0, vx1); | |||
| LD_DP2_INC(y, 2, vy0, vy1); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot1 += (vy1 * vx1); | |||
| } | |||
| else if (n & 2) | |||
| if (n & 2) | |||
| { | |||
| vx0 = LD_DP(x); x += 2; | |||
| vy0 = LD_DP(y); y += 2; | |||
| @@ -143,6 +113,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| } | |||
| } | |||
| dot0 += dot1 + dot2 + dot3; | |||
| dot += dot0[0]; | |||
| dot += dot0[1]; | |||
| } | |||
| @@ -159,16 +131,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| dot += (y3 * x3); | |||
| } | |||
| if ((n & 2) && (n & 1)) | |||
| { | |||
| LD_GP3_INC(x, inc_x, x0, x1, x2); | |||
| LD_GP3_INC(y, inc_y, y0, y1, y2); | |||
| dot += (y0 * x0); | |||
| dot += (y1 * x1); | |||
| dot += (y2 * x2); | |||
| } | |||
| else if (n & 2) | |||
| if (n & 2) | |||
| { | |||
| LD_GP2_INC(x, inc_x, x0, x1); | |||
| LD_GP2_INC(y, inc_y, y0, y1); | |||
| @@ -176,7 +139,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| dot += (y0 * x0); | |||
| dot += (y1 * x1); | |||
| } | |||
| else if (n & 1) | |||
| if (n & 1) | |||
| { | |||
| x0 = *x; | |||
| y0 = *y; | |||
| @@ -34,42 +34,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i = 0; | |||
| FLOAT data0, data1, data2, sumf = 0.0; | |||
| FLOAT data0, data1, sumf = 0.0; | |||
| v4f32 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3; | |||
| v4f32 zero_v = {0}; | |||
| v4f32 src8, src9, src10, src11, src12, src13, src14, src15; | |||
| v4f32 sum_abs0 = {0, 0, 0, 0}; | |||
| v4f32 sum_abs1 = {0, 0, 0, 0}; | |||
| v4f32 sum_abs2 = {0, 0, 0, 0}; | |||
| v4f32 sum_abs3 = {0, 0, 0, 0}; | |||
| v4f32 zero_v = {0, 0, 0, 0}; | |||
| v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; | |||
| if (n <= 0 || inc_x <= 0) return (sumf); | |||
| if (1 == inc_x) | |||
| { | |||
| if (n > 31) | |||
| { | |||
| n -= 32; | |||
| LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| #ifdef ENABLE_PREFETCH | |||
| FLOAT *x_pref; | |||
| BLASLONG pref_offset; | |||
| sum_abs0 = AND_VEC_W(src0); | |||
| sum_abs1 = AND_VEC_W(src1); | |||
| sum_abs2 = AND_VEC_W(src2); | |||
| sum_abs3 = AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs2 += AND_VEC_W(src6); | |||
| sum_abs3 += AND_VEC_W(src7); | |||
| } | |||
| else | |||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| sum_abs0 = zero_v; | |||
| sum_abs1 = zero_v; | |||
| sum_abs2 = zero_v; | |||
| sum_abs3 = zero_v; | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| } | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| x_pref = x + pref_offset + 128; | |||
| #endif | |||
| for (i = 0; i < (n >> 5); i++) | |||
| for (i = 0; i < (n >> 6); i++) | |||
| { | |||
| #ifdef ENABLE_PREFETCH | |||
| __asm__ __volatile__( | |||
| "pref 0, 0(%[x_pref])\n\t" | |||
| "pref 0, 32(%[x_pref])\n\t" | |||
| "pref 0, 64(%[x_pref])\n\t" | |||
| "pref 0, 96(%[x_pref])\n\t" | |||
| "pref 0, 128(%[x_pref])\n\t" | |||
| "pref 0, 160(%[x_pref])\n\t" | |||
| "pref 0, 192(%[x_pref])\n\t" | |||
| "pref 0, 224(%[x_pref])\n\t" | |||
| : : [x_pref] "r" (x_pref) | |||
| ); | |||
| x_pref += 64; | |||
| #endif | |||
| LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| @@ -79,13 +91,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs2 += AND_VEC_W(src6); | |||
| sum_abs3 += AND_VEC_W(src7); | |||
| sum_abs0 += AND_VEC_W(src8); | |||
| sum_abs1 += AND_VEC_W(src9); | |||
| sum_abs2 += AND_VEC_W(src10); | |||
| sum_abs3 += AND_VEC_W(src11); | |||
| sum_abs0 += AND_VEC_W(src12); | |||
| sum_abs1 += AND_VEC_W(src13); | |||
| sum_abs2 += AND_VEC_W(src14); | |||
| sum_abs3 += AND_VEC_W(src15); | |||
| } | |||
| if (n & 31) | |||
| if (n & 63) | |||
| { | |||
| if ((n & 16) && (n & 8) && (n & 4)) | |||
| if (n & 32) | |||
| { | |||
| LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6); | |||
| LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| @@ -94,65 +114,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs2 += AND_VEC_W(src6); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf += sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if ((n & 16) && (n & 8)) | |||
| { | |||
| LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src5); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf += sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if ((n & 16) && (n & 4)) | |||
| { | |||
| LD_SP5_INC(x, 4, src0, src1, src2, src3, src4); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 += AND_VEC_W(src4); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf += sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| sum_abs3 += AND_VEC_W(src7); | |||
| } | |||
| else if ((n & 8) && (n & 4)) | |||
| { | |||
| LD_SP3_INC(x, 4, src0, src1, src2); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf += sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if (n & 16) | |||
| if (n & 16) | |||
| { | |||
| LD_SP4_INC(x, 4, src0, src1, src2, src3); | |||
| @@ -160,79 +125,47 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf += sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if (n & 8) | |||
| if (n & 8) | |||
| { | |||
| LD_SP2_INC(x, 4, src0, src1); | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf += sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else if (n & 4) | |||
| if (n & 4) | |||
| { | |||
| src0 = LD_SP(x); x += 4; | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf += sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else | |||
| { | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf += sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| if (n & 2) | |||
| { | |||
| sumf += fabsf(*(x + 0)); | |||
| sumf += fabsf(*x); | |||
| sumf += fabsf(*(x + 1)); | |||
| x += 2; | |||
| } | |||
| if (n & 1) | |||
| { | |||
| sumf += fabsf(*(x + 0)); | |||
| sumf += fabsf(*x); | |||
| } | |||
| } | |||
| else | |||
| { | |||
| sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf += sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf += sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| else | |||
| { | |||
| if (n > 8) | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| n -= 8; | |||
| src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| x += inc_x; | |||
| src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); | |||
| @@ -241,92 +174,97 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| x += inc_x; | |||
| src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); | |||
| x += inc_x; | |||
| src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| src1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| x += inc_x; | |||
| src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x)); | |||
| src1 = (v4f32) __msa_insert_w((v4i32) src1, 1, *((int *) x)); | |||
| x += inc_x; | |||
| src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x)); | |||
| src1 = (v4f32) __msa_insert_w((v4i32) src1, 2, *((int *) x)); | |||
| x += inc_x; | |||
| src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x)); | |||
| src1 = (v4f32) __msa_insert_w((v4i32) src1, 3, *((int *) x)); | |||
| x += inc_x; | |||
| sum_abs0 = AND_VEC_W(src0); | |||
| sum_abs1 = AND_VEC_W(src4); | |||
| } | |||
| else | |||
| { | |||
| sum_abs0 = zero_v; | |||
| sum_abs1 = zero_v; | |||
| } | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| src2 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| x += inc_x; | |||
| src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); | |||
| src2 = (v4f32) __msa_insert_w((v4i32) src2, 1, *((int *) x)); | |||
| x += inc_x; | |||
| src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); | |||
| src2 = (v4f32) __msa_insert_w((v4i32) src2, 2, *((int *) x)); | |||
| x += inc_x; | |||
| src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); | |||
| src2 = (v4f32) __msa_insert_w((v4i32) src2, 3, *((int *) x)); | |||
| x += inc_x; | |||
| src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| src3 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| x += inc_x; | |||
| src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x)); | |||
| src3 = (v4f32) __msa_insert_w((v4i32) src3, 1, *((int *) x)); | |||
| x += inc_x; | |||
| src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x)); | |||
| src3 = (v4f32) __msa_insert_w((v4i32) src3, 2, *((int *) x)); | |||
| x += inc_x; | |||
| src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x)); | |||
| src3 = (v4f32) __msa_insert_w((v4i32) src3, 3, *((int *) x)); | |||
| x += inc_x; | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src4); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| sum_abs2 += AND_VEC_W(src2); | |||
| sum_abs3 += AND_VEC_W(src3); | |||
| } | |||
| if (n & 4) | |||
| if (n & 15) | |||
| { | |||
| src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| x += inc_x; | |||
| src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); | |||
| x += inc_x; | |||
| src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); | |||
| x += inc_x; | |||
| src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); | |||
| x += inc_x; | |||
| if (n & 8) | |||
| { | |||
| src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| x += inc_x; | |||
| src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); | |||
| x += inc_x; | |||
| src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); | |||
| x += inc_x; | |||
| src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); | |||
| x += inc_x; | |||
| src1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| x += inc_x; | |||
| src1 = (v4f32) __msa_insert_w((v4i32) src1, 1, *((int *) x)); | |||
| x += inc_x; | |||
| src1 = (v4f32) __msa_insert_w((v4i32) src1, 2, *((int *) x)); | |||
| x += inc_x; | |||
| src1 = (v4f32) __msa_insert_w((v4i32) src1, 3, *((int *) x)); | |||
| x += inc_x; | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| } | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| sum_abs1 += AND_VEC_W(src1); | |||
| } | |||
| sum_abs0 += sum_abs1; | |||
| if (n & 4) | |||
| { | |||
| src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| x += inc_x; | |||
| src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x)); | |||
| x += inc_x; | |||
| src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x)); | |||
| x += inc_x; | |||
| src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x)); | |||
| x += inc_x; | |||
| sumf += sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| sum_abs0 += AND_VEC_W(src0); | |||
| } | |||
| if ((n & 2) && (n & 1)) | |||
| { | |||
| data0 = fabsf(*x); x += inc_x; | |||
| data1 = fabsf(*x); x += inc_x; | |||
| data2 = fabsf(*x); | |||
| if (n & 2) | |||
| { | |||
| data0 = fabsf(*x); x += inc_x; | |||
| data1 = fabsf(*x); x += inc_x; | |||
| sumf += data0; | |||
| sumf += data1; | |||
| sumf += data2; | |||
| } | |||
| else if (n & 2) | |||
| { | |||
| data0 = fabsf(*x); x += inc_x; | |||
| data1 = fabsf(*x); | |||
| sumf += data0; | |||
| sumf += data1; | |||
| } | |||
| sumf += data0; | |||
| sumf += data1; | |||
| if (n & 1) | |||
| { | |||
| sumf += fabsf(*x); | |||
| } | |||
| } | |||
| else if (n & 1) | |||
| { | |||
| data0 = fabsf(*x); | |||
| sumf += data0; | |||
| } | |||
| sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf += sum_abs0[0]; | |||
| sumf += sum_abs0[1]; | |||
| sumf += sum_abs0[2]; | |||
| sumf += sum_abs0[3]; | |||
| } | |||
| return (sumf); | |||
| @@ -28,7 +28,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| /* return float, x,y float */ | |||
| #if defined(DSDOT) | |||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #else | |||
| @@ -37,96 +36,71 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i = 0; | |||
| double dot = 0.0; | |||
| float x0, x1, x2, x3, y0, y1, y2, y3; | |||
| FLOAT x0, x1, x2, x3, y0, y1, y2, y3; | |||
| v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; | |||
| v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; | |||
| v4f32 dot0 = {0, 0, 0, 0}; | |||
| v4f32 dot1 = {0, 0, 0, 0}; | |||
| v4f32 dot2 = {0, 0, 0, 0}; | |||
| v4f32 dot3 = {0, 0, 0, 0}; | |||
| if (n < 0) return (dot); | |||
| if (n < 1) return (dot); | |||
| if ((1 == inc_x) && (1 == inc_y)) | |||
| { | |||
| for (i = (n >> 5); i--;) | |||
| { | |||
| LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); | |||
| LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); | |||
| LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); | |||
| LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); | |||
| #ifdef ENABLE_PREFETCH | |||
| __asm__ __volatile__( | |||
| "pref 0, 256(%[x])\n\t" | |||
| "pref 0, 288(%[x])\n\t" | |||
| "pref 0, 320(%[x])\n\t" | |||
| "pref 0, 352(%[x])\n\t" | |||
| "pref 0, 256(%[y])\n\t" | |||
| "pref 0, 288(%[y])\n\t" | |||
| "pref 0, 320(%[y])\n\t" | |||
| "pref 0, 352(%[y])\n\t" | |||
| : : [x] "r" (x), [y] "r" (y) | |||
| ); | |||
| #endif | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| dot0 += (vy3 * vx3); | |||
| dot1 += (vy1 * vx1); | |||
| dot2 += (vy2 * vx2); | |||
| dot3 += (vy3 * vx3); | |||
| dot0 += (vy4 * vx4); | |||
| dot0 += (vy5 * vx5); | |||
| dot0 += (vy6 * vx6); | |||
| dot0 += (vy7 * vx7); | |||
| dot1 += (vy5 * vx5); | |||
| dot2 += (vy6 * vx6); | |||
| dot3 += (vy7 * vx7); | |||
| } | |||
| if (n & 31) | |||
| { | |||
| if ((n & 16) && (n & 8) && (n & 4)) | |||
| if (n & 16) | |||
| { | |||
| LD_SP7_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6); | |||
| LD_SP7_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6); | |||
| LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); | |||
| LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| dot0 += (vy3 * vx3); | |||
| dot0 += (vy4 * vx4); | |||
| dot0 += (vy5 * vx5); | |||
| dot0 += (vy6 * vx6); | |||
| dot1 += (vy1 * vx1); | |||
| dot2 += (vy2 * vx2); | |||
| dot3 += (vy3 * vx3); | |||
| } | |||
| else if ((n & 16) && (n & 8)) | |||
| { | |||
| LD_SP6_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5); | |||
| LD_SP6_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| dot0 += (vy3 * vx3); | |||
| dot0 += (vy4 * vx4); | |||
| dot0 += (vy5 * vx5); | |||
| } | |||
| else if ((n & 16) && (n & 4)) | |||
| if (n & 8) | |||
| { | |||
| LD_SP5_INC(x, 4, vx0, vx1, vx2, vx3, vx4); | |||
| LD_SP5_INC(y, 4, vy0, vy1, vy2, vy3, vy4); | |||
| LD_SP2_INC(x, 4, vx0, vx1); | |||
| LD_SP2_INC(y, 4, vy0, vy1); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| dot0 += (vy3 * vx3); | |||
| dot0 += (vy4 * vx4); | |||
| dot1 += (vy1 * vx1); | |||
| } | |||
| else if ((n & 8) && (n & 4)) | |||
| { | |||
| LD_SP3_INC(x, 4, vx0, vx1, vx2); | |||
| LD_SP3_INC(y, 4, vy0, vy1, vy2); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| } | |||
| else if (n & 16) | |||
| { | |||
| LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); | |||
| LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| dot0 += (vy2 * vx2); | |||
| dot0 += (vy3 * vx3); | |||
| } | |||
| else if (n & 8) | |||
| { | |||
| LD_SP2_INC(x, 4, vx0, vx1); | |||
| LD_SP2_INC(y, 4, vy0, vy1); | |||
| dot0 += (vy0 * vx0); | |||
| dot0 += (vy1 * vx1); | |||
| } | |||
| else if (n & 4) | |||
| if (n & 4) | |||
| { | |||
| vx0 = LD_SP(x); x += 4; | |||
| vy0 = LD_SP(y); y += 4; | |||
| @@ -134,16 +108,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| dot0 += (vy0 * vx0); | |||
| } | |||
| if ((n & 2) && (n & 1)) | |||
| { | |||
| LD_GP3_INC(x, 1, x0, x1, x2); | |||
| LD_GP3_INC(y, 1, y0, y1, y2); | |||
| dot += (y0 * x0); | |||
| dot += (y1 * x1); | |||
| dot += (y2 * x2); | |||
| } | |||
| else if (n & 2) | |||
| if (n & 2) | |||
| { | |||
| LD_GP2_INC(x, 1, x0, x1); | |||
| LD_GP2_INC(y, 1, y0, y1); | |||
| @@ -151,7 +116,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| dot += (y0 * x0); | |||
| dot += (y1 * x1); | |||
| } | |||
| else if (n & 1) | |||
| if (n & 1) | |||
| { | |||
| x0 = *x; | |||
| y0 = *y; | |||
| @@ -160,6 +126,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| } | |||
| } | |||
| dot0 += dot1 + dot2 + dot3; | |||
| dot += dot0[0]; | |||
| dot += dot0[1]; | |||
| dot += dot0[2]; | |||
| @@ -178,16 +146,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| dot += (y3 * x3); | |||
| } | |||
| if ((n & 2) && (n & 1)) | |||
| { | |||
| LD_GP3_INC(x, inc_x, x0, x1, x2); | |||
| LD_GP3_INC(y, inc_y, y0, y1, y2); | |||
| dot += (y0 * x0); | |||
| dot += (y1 * x1); | |||
| dot += (y2 * x2); | |||
| } | |||
| else if (n & 2) | |||
| if (n & 2) | |||
| { | |||
| LD_GP2_INC(x, inc_x, x0, x1); | |||
| LD_GP2_INC(y, inc_y, y0, y1); | |||
| @@ -195,7 +154,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| dot += (y0 * x0); | |||
| dot += (y1 * x1); | |||
| } | |||
| else if (n & 1) | |||
| if (n & 1) | |||
| { | |||
| x0 = *x; | |||
| y0 = *y; | |||
| @@ -31,139 +31,191 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec)) | |||
| #define PROCESS_ZD(inc_val) \ | |||
| if (n > 8) \ | |||
| { \ | |||
| n -= 8; \ | |||
| \ | |||
| LD_DP8_INC(x, inc_val, src0, src1, src2, \ | |||
| src3, src4, src5, src6, src7); \ | |||
| \ | |||
| sum_abs0 = AND_VEC_D(src0); \ | |||
| sum_abs1 = AND_VEC_D(src1); \ | |||
| sum_abs2 = AND_VEC_D(src2); \ | |||
| sum_abs3 = AND_VEC_D(src3); \ | |||
| sum_abs0 += AND_VEC_D(src4); \ | |||
| sum_abs1 += AND_VEC_D(src5); \ | |||
| sum_abs2 += AND_VEC_D(src6); \ | |||
| sum_abs3 += AND_VEC_D(src7); \ | |||
| } \ | |||
| else \ | |||
| { \ | |||
| sum_abs0 = zero_v; \ | |||
| sum_abs1 = zero_v; \ | |||
| sum_abs2 = zero_v; \ | |||
| sum_abs3 = zero_v; \ | |||
| } \ | |||
| \ | |||
| for (i = (n >> 3); i--;) \ | |||
| { \ | |||
| LD_DP8_INC(x, inc_val, src0, src1, src2, \ | |||
| src3, src4, src5, src6, src7); \ | |||
| \ | |||
| sum_abs0 += AND_VEC_D(src0); \ | |||
| sum_abs1 += AND_VEC_D(src1); \ | |||
| sum_abs2 += AND_VEC_D(src2); \ | |||
| sum_abs3 += AND_VEC_D(src3); \ | |||
| sum_abs0 += AND_VEC_D(src4); \ | |||
| sum_abs1 += AND_VEC_D(src5); \ | |||
| sum_abs2 += AND_VEC_D(src6); \ | |||
| sum_abs3 += AND_VEC_D(src7); \ | |||
| } \ | |||
| \ | |||
| if (n & 7) \ | |||
| { \ | |||
| if ((n & 4) && (n & 2) && (n & 1)) \ | |||
| { \ | |||
| LD_DP7_INC(x, inc_val, src0, src1, src2, \ | |||
| src3, src4, src5, src6); \ | |||
| \ | |||
| sum_abs0 += AND_VEC_D(src0); \ | |||
| sum_abs1 += AND_VEC_D(src1); \ | |||
| sum_abs2 += AND_VEC_D(src2); \ | |||
| sum_abs3 += AND_VEC_D(src3); \ | |||
| sum_abs0 += AND_VEC_D(src4); \ | |||
| sum_abs1 += AND_VEC_D(src5); \ | |||
| sum_abs2 += AND_VEC_D(src6); \ | |||
| } \ | |||
| else if ((n & 4) && (n & 2)) \ | |||
| { \ | |||
| LD_DP6_INC(x, inc_val, src0, src1, src2, \ | |||
| src3, src4, src5); \ | |||
| \ | |||
| sum_abs0 += AND_VEC_D(src0); \ | |||
| sum_abs1 += AND_VEC_D(src1); \ | |||
| sum_abs2 += AND_VEC_D(src2); \ | |||
| sum_abs3 += AND_VEC_D(src3); \ | |||
| sum_abs0 += AND_VEC_D(src4); \ | |||
| sum_abs1 += AND_VEC_D(src5); \ | |||
| } \ | |||
| else if ((n & 4) && (n & 1)) \ | |||
| { \ | |||
| LD_DP5_INC(x, inc_val, src0, src1, src2, \ | |||
| src3, src4); \ | |||
| \ | |||
| sum_abs0 += AND_VEC_D(src0); \ | |||
| sum_abs1 += AND_VEC_D(src1); \ | |||
| sum_abs2 += AND_VEC_D(src2); \ | |||
| sum_abs3 += AND_VEC_D(src3); \ | |||
| sum_abs0 += AND_VEC_D(src4); \ | |||
| } \ | |||
| else if ((n & 2) && (n & 1)) \ | |||
| { \ | |||
| LD_DP3_INC(x, inc_val, src0, src1, src2); \ | |||
| \ | |||
| sum_abs0 += AND_VEC_D(src0); \ | |||
| sum_abs1 += AND_VEC_D(src1); \ | |||
| sum_abs2 += AND_VEC_D(src2); \ | |||
| } \ | |||
| else if (n & 4) \ | |||
| { \ | |||
| LD_DP4_INC(x, inc_val, src0, src1, src2, \ | |||
| src3); \ | |||
| \ | |||
| sum_abs0 += AND_VEC_D(src0); \ | |||
| sum_abs1 += AND_VEC_D(src1); \ | |||
| sum_abs2 += AND_VEC_D(src2); \ | |||
| sum_abs3 += AND_VEC_D(src3); \ | |||
| } \ | |||
| else if (n & 2) \ | |||
| { \ | |||
| LD_DP2_INC(x, inc_val, src0, src1); \ | |||
| \ | |||
| sum_abs0 += AND_VEC_D(src0); \ | |||
| sum_abs1 += AND_VEC_D(src1); \ | |||
| } \ | |||
| else if (n & 1) \ | |||
| { \ | |||
| src0 = LD_DP(x); \ | |||
| \ | |||
| sum_abs0 += AND_VEC_D(src0); \ | |||
| } \ | |||
| } \ | |||
| \ | |||
| sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; \ | |||
| sumf = sum_abs0[0] + sum_abs0[1]; | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT sumf = 0.0; | |||
| v2f64 src0, src1, src2, src3, src4, src5, src6, src7; | |||
| v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3; | |||
| v2f64 zero_v = {0}; | |||
| v2f64 src8, src9, src10, src11, src12, src13, src14, src15; | |||
| v2f64 sum_abs0 = {0, 0}; | |||
| v2f64 sum_abs1 = {0, 0}; | |||
| v2f64 sum_abs2 = {0, 0}; | |||
| v2f64 sum_abs3 = {0, 0}; | |||
| v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF}; | |||
| if (n <= 0 || inc_x <= 0) return (sumf); | |||
| if (1 == inc_x) | |||
| { | |||
| PROCESS_ZD(2); | |||
| #ifdef ENABLE_PREFETCH | |||
| FLOAT *x_pref; | |||
| BLASLONG pref_offset; | |||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| } | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| x_pref = x + pref_offset + 64; | |||
| #endif | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| #ifdef ENABLE_PREFETCH | |||
| __asm__ __volatile__( | |||
| "pref 0, 0(%[x_pref])\n\t" | |||
| "pref 0, 32(%[x_pref])\n\t" | |||
| "pref 0, 64(%[x_pref])\n\t" | |||
| "pref 0, 96(%[x_pref])\n\t" | |||
| "pref 0, 128(%[x_pref])\n\t" | |||
| "pref 0, 160(%[x_pref])\n\t" | |||
| "pref 0, 192(%[x_pref])\n\t" | |||
| "pref 0, 224(%[x_pref])\n\t" | |||
| : : [x_pref] "r" (x_pref) | |||
| ); | |||
| x_pref += 32; | |||
| #endif | |||
| LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| sum_abs3 += AND_VEC_D(src7); | |||
| sum_abs0 += AND_VEC_D(src8); | |||
| sum_abs1 += AND_VEC_D(src9); | |||
| sum_abs2 += AND_VEC_D(src10); | |||
| sum_abs3 += AND_VEC_D(src11); | |||
| sum_abs0 += AND_VEC_D(src12); | |||
| sum_abs1 += AND_VEC_D(src13); | |||
| sum_abs2 += AND_VEC_D(src14); | |||
| sum_abs3 += AND_VEC_D(src15); | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if (n & 8) | |||
| { | |||
| LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| sum_abs3 += AND_VEC_D(src7); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(x, 2, src0, src1, src2, src3); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(x, 2, src0, src1); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| src0 = LD_DP(x); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| } | |||
| } | |||
| sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0] + sum_abs0[1]; | |||
| } | |||
| else | |||
| { | |||
| inc_x *= 2; | |||
| PROCESS_ZD(inc_x); | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| sum_abs3 += AND_VEC_D(src7); | |||
| sum_abs0 += AND_VEC_D(src8); | |||
| sum_abs1 += AND_VEC_D(src9); | |||
| sum_abs2 += AND_VEC_D(src10); | |||
| sum_abs3 += AND_VEC_D(src11); | |||
| sum_abs0 += AND_VEC_D(src12); | |||
| sum_abs1 += AND_VEC_D(src13); | |||
| sum_abs2 += AND_VEC_D(src14); | |||
| sum_abs3 += AND_VEC_D(src15); | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if (n & 8) | |||
| { | |||
| LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| sum_abs0 += AND_VEC_D(src4); | |||
| sum_abs1 += AND_VEC_D(src5); | |||
| sum_abs2 += AND_VEC_D(src6); | |||
| sum_abs3 += AND_VEC_D(src7); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(x, inc_x, src0, src1, src2, src3); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| sum_abs2 += AND_VEC_D(src2); | |||
| sum_abs3 += AND_VEC_D(src3); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(x, inc_x, src0, src1); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| sum_abs1 += AND_VEC_D(src1); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| src0 = LD_DP(x); | |||
| sum_abs0 += AND_VEC_D(src0); | |||
| } | |||
| } | |||
| sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; | |||
| sumf = sum_abs0[0] + sum_abs0[1]; | |||
| } | |||
| return (sumf); | |||
| @@ -29,195 +29,220 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "macros_msa.h" | |||
/*
 * Sign-selection operators for the complex dot product.
 *
 * OP1 is applied when the (x.imag * y.imag) product is folded into the real
 * accumulator, OP2 when the (x.imag * y.real) product is folded into the
 * imaginary accumulator:
 *
 *   !CONJ (zdotu):  re += xr*yr - xi*yi;   im += xi*yr + xr*yi
 *    CONJ (zdotc):  re += xr*yr + xi*yi;   im += xr*yi - xi*yr
 *
 * OP3 and OP4 are the plain binary counterparts of OP1 and OP2.
 * NOTE(review): OP3/OP4 are not referenced in the visible part of this file;
 * they are kept in case code elsewhere relies on them - confirm before
 * removing.
 */
#if !defined(CONJ)
    #define OP1     -=
    #define OP2     +=
    #define OP3     -
    #define OP4     +
#else
    #define OP1     +=
    #define OP2     -=
    #define OP3     +
    #define OP4     -
#endif
/*
 * Complex dot-product accumulation kernels.
 *
 * Every kernel expects the accumulators dot0 (real part) and dot1 (imaginary
 * part) and the operands vxNr / vxNi / vyNr / vyNi to be in scope where it is
 * used.  OPR0 and OPR1 are bare '+' or '-' tokens; each is pasted onto '=' to
 * build the compound assignment applied to the (xi * yi) product for dot0
 * and to the (xi * yr) product for dot1.
 *
 * DOTn_KERNEL processes n/4 operand sets (index 0 .. n/4 - 1) in order, so
 * the sequence of floating-point updates is the same as in the fully
 * written-out form.  The kernels are wrapped in do { } while (0) so that
 * "DOTn_KERNEL(-, +);" is exactly one statement.
 */

/* One accumulation step for a single (real, imaginary) operand set. */
#define DOT_CPLX_STEP(XR, XI, YR, YI, OPR0, OPR1)  \
    dot0 += ((XR) * (YR));                         \
    dot0 OPR0##= ((XI) * (YI));                    \
    dot1 OPR1##= ((XI) * (YR));                    \
    dot1 += ((XR) * (YI));

#define DOT16_KERNEL(OPR0, OPR1)                           \
    do {                                                   \
        DOT_CPLX_STEP(vx0r, vx0i, vy0r, vy0i, OPR0, OPR1)  \
        DOT_CPLX_STEP(vx1r, vx1i, vy1r, vy1i, OPR0, OPR1)  \
        DOT_CPLX_STEP(vx2r, vx2i, vy2r, vy2i, OPR0, OPR1)  \
        DOT_CPLX_STEP(vx3r, vx3i, vy3r, vy3i, OPR0, OPR1)  \
    } while (0)

#define DOT12_KERNEL(OPR0, OPR1)                           \
    do {                                                   \
        DOT_CPLX_STEP(vx0r, vx0i, vy0r, vy0i, OPR0, OPR1)  \
        DOT_CPLX_STEP(vx1r, vx1i, vy1r, vy1i, OPR0, OPR1)  \
        DOT_CPLX_STEP(vx2r, vx2i, vy2r, vy2i, OPR0, OPR1)  \
    } while (0)

#define DOT8_KERNEL(OPR0, OPR1)                            \
    do {                                                   \
        DOT_CPLX_STEP(vx0r, vx0i, vy0r, vy0i, OPR0, OPR1)  \
        DOT_CPLX_STEP(vx1r, vx1i, vy1r, vy1i, OPR0, OPR1)  \
    } while (0)

#define DOT4_KERNEL(OPR0, OPR1)                            \
    do {                                                   \
        DOT_CPLX_STEP(vx0r, vx0i, vy0r, vy0i, OPR0, OPR1)  \
    } while (0)
| /* return double, x,y double */ | |||
| /* zdotc - CONJ */ | |||
| /* zdotu - !CONJ */ | |||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i = 0; | |||
| FLOAT dot[2]; | |||
| BLASLONG inc_x2; | |||
| BLASLONG inc_y2; | |||
| BLASLONG inc_x2, inc_y2; | |||
| v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; | |||
| v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; | |||
| v2f64 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i; | |||
| v2f64 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i; | |||
| v2f64 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i; | |||
| v2f64 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i; | |||
| v2f64 dot0 = {0, 0}; | |||
| v2f64 dot1 = {0, 0}; | |||
| v2f64 dot2 = {0, 0}; | |||
| v2f64 dot3 = {0, 0}; | |||
| v2f64 dot4 = {0, 0}; | |||
| v2f64 dot5 = {0, 0}; | |||
| v2f64 dot6 = {0, 0}; | |||
| v2f64 dot7 = {0, 0}; | |||
| v2f64 zero = {0, 0}; | |||
| openblas_complex_double result; | |||
| OPENBLAS_COMPLEX_FLOAT result; | |||
| dot[0] = 0.0; | |||
| dot[1] = 0.0; | |||
| __real__(result) = 0.0; | |||
| __imag__(result) = 0.0; | |||
| CREAL(result) = 0.0; | |||
| CIMAG(result) = 0.0; | |||
| if ( n < 1 ) return(result); | |||
| if (n < 1) return (result); | |||
| inc_x2 = 2 * inc_x; | |||
| inc_y2 = 2 * inc_y; | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| LD_DP8_INC(x, inc_x2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); | |||
| LD_DP8_INC(y, inc_y2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); | |||
| PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i); | |||
| PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i); | |||
| PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i); | |||
| PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); | |||
| PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i); | |||
| PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i); | |||
| PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i); | |||
| #if !defined(CONJ) | |||
| DOT16_KERNEL(-, +); | |||
| #else | |||
| DOT16_KERNEL(+, -); | |||
| #endif | |||
| } | |||
| if (n & 7) | |||
| { | |||
| if ((n & 4) && (n & 2)) | |||
| { | |||
| LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3); | |||
| LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3); | |||
| LD_DP2_INC(x, inc_x2, vx4, vx5); | |||
| LD_DP2_INC(y, inc_y2, vy4, vy5); | |||
| PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i); | |||
| PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i); | |||
| PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); | |||
| PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i); | |||
| PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i); | |||
| #if !defined(CONJ) | |||
| DOT12_KERNEL(-, +); | |||
| #else | |||
| DOT12_KERNEL(+, -); | |||
| #endif | |||
| } | |||
| else if (n & 4) | |||
| { | |||
| LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3); | |||
| LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3); | |||
| PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i); | |||
| PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); | |||
| PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i); | |||
| #if !defined(CONJ) | |||
| DOT8_KERNEL(-, +); | |||
| #else | |||
| DOT8_KERNEL(+, -); | |||
| #endif | |||
| } | |||
| else if (n & 2) | |||
| { | |||
| LD_DP2_INC(x, inc_x2, vx0, vx1); | |||
| LD_DP2_INC(y, inc_y2, vy0, vy1); | |||
| PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); | |||
| #if !defined(CONJ) | |||
| DOT4_KERNEL(-, +); | |||
| #else | |||
| DOT4_KERNEL(+, -); | |||
| #endif | |||
| } | |||
| if (n & 1) | |||
| { | |||
| vx0 = LD_DP(x); | |||
| vy0 = LD_DP(y); | |||
| PCKEVOD_D2_DP(zero, vx0, vx0r, vx0i); | |||
| PCKEVOD_D2_DP(zero, vy0, vy0r, vy0i); | |||
| #if !defined(CONJ) | |||
| DOT4_KERNEL(-, +); | |||
| #else | |||
| DOT4_KERNEL(+, -); | |||
| #endif | |||
| } | |||
| } | |||
| dot[0] += (dot0[0] + dot0[1]); | |||
| dot[1] += (dot1[0] + dot1[1]); | |||
| __real__(result) = dot[0]; | |||
| __imag__(result) = dot[1]; | |||
| return(result); | |||
| #ifdef ENABLE_PREFETCH | |||
| if ((1 == inc_x) && (1 == inc_y)) | |||
| { | |||
| double *x_pref, *y_pref; | |||
| BLASLONG pref_offset; | |||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| } | |||
| pref_offset = pref_offset / sizeof(double); | |||
| x_pref = x + pref_offset + 32; | |||
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| } | |||
| pref_offset = pref_offset / sizeof(double); | |||
| y_pref = y + pref_offset + 32; | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| __asm__ __volatile__( | |||
| "pref 0, 0(%[x_pref])\n\t" | |||
| "pref 0, 32(%[x_pref])\n\t" | |||
| "pref 0, 64(%[x_pref])\n\t" | |||
| "pref 0, 96(%[x_pref])\n\t" | |||
| "pref 0, 0(%[y_pref])\n\t" | |||
| "pref 0, 32(%[y_pref])\n\t" | |||
| "pref 0, 64(%[y_pref])\n\t" | |||
| "pref 0, 96(%[y_pref])\n\t" | |||
| : : [x_pref] "r" (x_pref), [y_pref] "r" (y_pref) | |||
| ); | |||
| x_pref += 16; | |||
| y_pref += 16; | |||
| LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); | |||
| LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); | |||
| PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i); | |||
| PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i); | |||
| PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i); | |||
| PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); | |||
| PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i); | |||
| PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i); | |||
| PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i); | |||
| dot0 += (vx0r * vy0r); | |||
| dot0 OP1 (vx0i * vy0i); | |||
| dot1 OP2 (vx0i * vy0r); | |||
| dot1 += (vx0r * vy0i); | |||
| dot2 += (vx1r * vy1r); | |||
| dot2 OP1 (vx1i * vy1i); | |||
| dot3 OP2 (vx1i * vy1r); | |||
| dot3 += (vx1r * vy1i); | |||
| dot4 += (vx2r * vy2r); | |||
| dot4 OP1 (vx2i * vy2i); | |||
| dot5 OP2 (vx2i * vy2r); | |||
| dot5 += (vx2r * vy2i); | |||
| dot6 += (vx3r * vy3r); | |||
| dot6 OP1 (vx3i * vy3i); | |||
| dot7 OP2 (vx3i * vy3r); | |||
| dot7 += (vx3r * vy3i); | |||
| } | |||
| } | |||
| else | |||
| #endif | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| LD_DP8_INC(x, inc_x2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7); | |||
| LD_DP8_INC(y, inc_y2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7); | |||
| PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i); | |||
| PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i); | |||
| PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i); | |||
| PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); | |||
| PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i); | |||
| PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i); | |||
| PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i); | |||
| dot0 += (vx0r * vy0r); | |||
| dot0 OP1 (vx0i * vy0i); | |||
| dot1 OP2 (vx0i * vy0r); | |||
| dot1 += (vx0r * vy0i); | |||
| dot2 += (vx1r * vy1r); | |||
| dot2 OP1 (vx1i * vy1i); | |||
| dot3 OP2 (vx1i * vy1r); | |||
| dot3 += (vx1r * vy1i); | |||
| dot4 += (vx2r * vy2r); | |||
| dot4 OP1 (vx2i * vy2i); | |||
| dot5 OP2 (vx2i * vy2r); | |||
| dot5 += (vx2r * vy2i); | |||
| dot6 += (vx3r * vy3r); | |||
| dot6 OP1 (vx3i * vy3i); | |||
| dot7 OP2 (vx3i * vy3r); | |||
| dot7 += (vx3r * vy3i); | |||
| } | |||
| if (n & 7) | |||
| { | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3); | |||
| LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3); | |||
| PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i); | |||
| PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); | |||
| PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i); | |||
| dot0 += (vx0r * vy0r); | |||
| dot0 OP1 (vx0i * vy0i); | |||
| dot1 OP2 (vx0i * vy0r); | |||
| dot1 += (vx0r * vy0i); | |||
| dot2 += (vx1r * vy1r); | |||
| dot2 OP1 (vx1i * vy1i); | |||
| dot3 OP2 (vx1i * vy1r); | |||
| dot3 += (vx1r * vy1i); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(x, inc_x2, vx0, vx1); | |||
| LD_DP2_INC(y, inc_y2, vy0, vy1); | |||
| PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i); | |||
| PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i); | |||
| dot0 += (vx0r * vy0r); | |||
| dot0 OP1 (vx0i * vy0i); | |||
| dot1 OP2 (vx0i * vy0r); | |||
| dot1 += (vx0r * vy0i); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| vx0 = LD_DP(x); | |||
| vy0 = LD_DP(y); | |||
| PCKEVOD_D2_DP(zero, vx0, vx0r, vx0i); | |||
| PCKEVOD_D2_DP(zero, vy0, vy0r, vy0i); | |||
| dot0 += (vx0r * vy0r); | |||
| dot0 OP1 (vx0i * vy0i); | |||
| dot1 OP2 (vx0i * vy0r); | |||
| dot1 += (vx0r * vy0i); | |||
| } | |||
| } | |||
| dot0 += dot2 + dot4 + dot6; | |||
| dot1 += dot3 + dot5 + dot7; | |||
| dot[0] += (dot0[0] + dot0[1]); | |||
| dot[1] += (dot1[0] + dot1[1]); | |||
| CREAL(result) = dot[0]; | |||
| CIMAG(result) = dot[1]; | |||
| return (result); | |||
| } | |||