Add MSA optimization for AXPY, COPY, SCALE, SWAP (tags/v0.2.20^2)
| @@ -42,15 +42,29 @@ CASUMKERNEL = ../mips/asum.c | |||
| ZASUMKERNEL = ../mips/asum.c | |||
| endif | |||
# AXPY kernels (S/D/C/Z): use the per-precision *_msa.c sources when HAVE_MSA
# is defined; otherwise fall back to the shared ../mips/axpy.c and zaxpy.c.
| ifdef HAVE_MSA | |||
| SAXPYKERNEL = ../mips/saxpy_msa.c | |||
| DAXPYKERNEL = ../mips/daxpy_msa.c | |||
| CAXPYKERNEL = ../mips/caxpy_msa.c | |||
| ZAXPYKERNEL = ../mips/zaxpy_msa.c | |||
| else | |||
| SAXPYKERNEL = ../mips/axpy.c | |||
| DAXPYKERNEL = ../mips/axpy.c | |||
| CAXPYKERNEL = ../mips/zaxpy.c | |||
| ZAXPYKERNEL = ../mips/zaxpy.c | |||
| endif | |||
# COPY kernels (S/D/C/Z): same selection scheme as above, falling back to
# ../mips/copy.c (real) and ../mips/zcopy.c (complex) without HAVE_MSA.
| ifdef HAVE_MSA | |||
| SCOPYKERNEL = ../mips/scopy_msa.c | |||
| DCOPYKERNEL = ../mips/dcopy_msa.c | |||
| CCOPYKERNEL = ../mips/ccopy_msa.c | |||
| ZCOPYKERNEL = ../mips/zcopy_msa.c | |||
| else | |||
| SCOPYKERNEL = ../mips/copy.c | |||
| DCOPYKERNEL = ../mips/copy.c | |||
| CCOPYKERNEL = ../mips/zcopy.c | |||
| ZCOPYKERNEL = ../mips/zcopy.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| SDOTKERNEL = ../mips/sdot_msa.c | |||
| @@ -74,15 +88,29 @@ DROTKERNEL = ../mips/rot.c | |||
| CROTKERNEL = ../mips/zrot.c | |||
| ZROTKERNEL = ../mips/zrot.c | |||
# SCAL kernels (S/D/C/Z): use the per-precision *_msa.c sources when HAVE_MSA
# is defined; otherwise fall back to the shared ../mips/scal.c and zscal.c.
| ifdef HAVE_MSA | |||
| SSCALKERNEL = ../mips/sscal_msa.c | |||
| DSCALKERNEL = ../mips/dscal_msa.c | |||
| CSCALKERNEL = ../mips/cscal_msa.c | |||
| ZSCALKERNEL = ../mips/zscal_msa.c | |||
| else | |||
| SSCALKERNEL = ../mips/scal.c | |||
| DSCALKERNEL = ../mips/scal.c | |||
| CSCALKERNEL = ../mips/zscal.c | |||
| ZSCALKERNEL = ../mips/zscal.c | |||
| endif | |||
# SWAP kernels (S/D/C/Z): same selection scheme as above, falling back to
# ../mips/swap.c (real) and ../mips/zswap.c (complex) without HAVE_MSA.
| ifdef HAVE_MSA | |||
| SSWAPKERNEL = ../mips/sswap_msa.c | |||
| DSWAPKERNEL = ../mips/dswap_msa.c | |||
| CSWAPKERNEL = ../mips/cswap_msa.c | |||
| ZSWAPKERNEL = ../mips/zswap_msa.c | |||
| else | |||
| SSWAPKERNEL = ../mips/swap.c | |||
| DSWAPKERNEL = ../mips/swap.c | |||
| CSWAPKERNEL = ../mips/zswap.c | |||
| ZSWAPKERNEL = ../mips/zswap.c | |||
| endif | |||
| ifdef HAVE_MSA | |||
| SGEMVNKERNEL = ../mips/sgemv_n_msa.c | |||
| @@ -0,0 +1,471 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
/*
 * Sign selectors for the cross terms of the complex multiply-accumulate
 * performed by the kernel below:
 *
 *     y_r +=  da_r * x_r;      (always '+=', written out in the kernel)
 *     y_i OP0 da_r * x_i;
 *     y_r OP1 da_i * x_i;
 *     y_i OP2 da_i * x_r;
 *
 * Without CONJ this is y += da * x:
 *     real: da_r*x_r - da_i*x_i     imag: da_r*x_i + da_i*x_r
 * With CONJ it is y += da * conj(x):
 *     real: da_r*x_r + da_i*x_i     imag: da_i*x_r - da_r*x_i
 *
 * Only the two terms that contain x_i change sign under conjugation, so OP2
 * (the da_i * x_r term) is '+=' in both variants. Defining it as '-=' for
 * CONJ would yield imag = -(da_r*x_i + da_i*x_r), which paired with the
 * CONJ real part is not a complex product of da and x at all.
 */
#if !defined(CONJ)
#define OP0 +=
#define OP1 -=
#define OP2 +=
#else
#define OP0 -=
#define OP1 +=
#define OP2 +=
#endif
/*
 * Complex AXPY kernel using MSA vectors: for each of the n complex elements,
 * accumulates the product of the scalar (da_r, da_i) and x into y. The real
 * part always receives += da_r * x_r; the signs of the three remaining
 * cross terms are chosen by the OP0/OP1/OP2 macros (which depend on CONJ).
 *
 * x and y hold interleaved (real, imag) FLOAT pairs. inc_x and inc_y are
 * strides counted in complex elements (pointers advance by 2 * inc FLOATs).
 * dummy0, dummy1, dummy and dummy2 are never read.
 *
 * Returns 0 in every case. Nothing is written when n < 0 or when both da_r
 * and da_i compare equal to zero.
 *
 * NOTE(review): all vectors are v4f32, so FLOAT is presumably float in this
 * translation unit - confirm against the build flags. The LD_*, ST_*,
 * PCKEV*, ILVRL*, FMADD* and PREF_OFFSET helpers come from macros_msa.h;
 * comments about them below describe their apparent use and should be
 * confirmed against that header.
 */
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||
| BLASLONG dummy2) | |||
| { | |||
| BLASLONG i, inc_x2, inc_y2; | |||
/* py is the read cursor into y; y itself serves as the write cursor. */
| FLOAT *py; | |||
| v4f32 x0, x1, x2, x3, x4, x5, x6, x7; | |||
| v4f32 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec; | |||
| v4f32 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i; | |||
| v4f32 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i; | |||
| FLOAT xd0, xd1, xd2, xd3, xd4, xd5, xd6, xd7; | |||
| FLOAT yd0, yd1, yd2, yd3, yd4, yd5, yd6, yd7; | |||
/* Early exits: negative length, or a zero scalar (y would be unchanged). */
| if (n < 0) return(0); | |||
| if ((da_r == 0.0) && (da_i == 0.0)) return(0); | |||
| py = y; | |||
/* ---- Case 1: both vectors contiguous (unit stride). ---- */
| if ((1 == inc_x) && (1 == inc_y)) | |||
| { | |||
| FLOAT *x_pref, *y_pref; | |||
| BLASLONG pref_offset; | |||
/*
 * Prefetch cursors: distance from x (resp. y) up to the next
 * L1_DATA_LINESIZE boundary, converted from bytes to FLOATs, plus a
 * 64-FLOAT lead.
 */
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
| x_pref = x + pref_offset + 64; | |||
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
| y_pref = y + pref_offset + 64; | |||
/* Scalar parts replicated into vectors (per the macro name). */
| dar_vec = COPY_FLOAT_TO_VECTOR(da_r); | |||
| dai_vec = COPY_FLOAT_TO_VECTOR(da_i); | |||
/* Main loop: n >> 4 iterations, i.e. 16 complex elements per iteration. */
| for (i = (n >> 4); i--;) | |||
| { | |||
| PREF_OFFSET(x_pref, 0); | |||
| PREF_OFFSET(x_pref, 32); | |||
| PREF_OFFSET(x_pref, 64); | |||
| PREF_OFFSET(x_pref, 96); | |||
| PREF_OFFSET(y_pref, 0); | |||
| PREF_OFFSET(y_pref, 32); | |||
| PREF_OFFSET(y_pref, 64); | |||
| PREF_OFFSET(y_pref, 96); | |||
| x_pref += 32; | |||
| y_pref += 32; | |||
| LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7); | |||
/*
 * Presumably de-interleaves each pair of vectors into a vector of real
 * parts (*r) and a vector of imaginary parts (*i) - confirm in macros_msa.h.
 */
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); | |||
| PCKEVOD_W2_SP(y1, y0, y0r, y0i); | |||
| PCKEVOD_W2_SP(x3, x2, x1r, x1i); | |||
| PCKEVOD_W2_SP(y3, y2, y1r, y1i); | |||
| PCKEVOD_W2_SP(x5, x4, x2r, x2i); | |||
| PCKEVOD_W2_SP(y5, y4, y2r, y2i); | |||
| PCKEVOD_W2_SP(x7, x6, x3r, x3i); | |||
| PCKEVOD_W2_SP(y7, y6, y3r, y3i); | |||
/* Presumably y?r += x?r * dar_vec (mirrors the scalar tails below). */
| FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); | |||
/* Remaining three cross terms; signs come from OP0/OP1/OP2. */
| y0i OP0 dar_vec * x0i; | |||
| y1i OP0 dar_vec * x1i; | |||
| y2i OP0 dar_vec * x2i; | |||
| y3i OP0 dar_vec * x3i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y1r OP1 dai_vec * x1i; | |||
| y2r OP1 dai_vec * x2i; | |||
| y3r OP1 dai_vec * x3i; | |||
| y0i OP2 dai_vec * x0r; | |||
| y1i OP2 dai_vec * x1r; | |||
| y2i OP2 dai_vec * x2r; | |||
| y3i OP2 dai_vec * x3r; | |||
/* Presumably re-interleaves real/imag vectors back into storage order. */
| ILVRL_W2_SP(y0i, y0r, y0, y1); | |||
| ILVRL_W2_SP(y1i, y1r, y2, y3); | |||
| ILVRL_W2_SP(y2i, y2r, y4, y5); | |||
| ILVRL_W2_SP(y3i, y3r, y6, y7); | |||
| ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4); | |||
| } | |||
/* Tail: the low four bits of n select blocks of 8, 4, 2 and 1 elements. */
| if (n & 15) | |||
| { | |||
| if (n & 8) | |||
| { | |||
| LD_SP4_INC(x, 4, x0, x1, x2, x3); | |||
| LD_SP4_INC(py, 4, y0, y1, y2, y3); | |||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); | |||
| PCKEVOD_W2_SP(y1, y0, y0r, y0i); | |||
| PCKEVOD_W2_SP(x3, x2, x1r, x1i); | |||
| PCKEVOD_W2_SP(y3, y2, y1r, y1i); | |||
| FMADD2(x0r, x1r, dar_vec, y0r, y1r); | |||
| y0i OP0 dar_vec * x0i; | |||
| y1i OP0 dar_vec * x1i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y1r OP1 dai_vec * x1i; | |||
| y0i OP2 dai_vec * x0r; | |||
| y1i OP2 dai_vec * x1r; | |||
| ILVRL_W2_SP(y0i, y0r, y0, y1); | |||
| ILVRL_W2_SP(y1i, y1r, y2, y3); | |||
| ST_SP4_INC(y0, y1, y2, y3, y, 4); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_SP2_INC(x, 4, x0, x1); | |||
| LD_SP2_INC(py, 4, y0, y1); | |||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); | |||
| PCKEVOD_W2_SP(y1, y0, y0r, y0i); | |||
| y0r += dar_vec * x0r; | |||
| y0i OP0 dar_vec * x0i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y0i OP2 dai_vec * x0r; | |||
| ILVRL_W2_SP(y0i, y0r, y0, y1); | |||
| ST_SP2_INC(y0, y1, y, 4); | |||
| } | |||
/* Last 2 and 1 elements are handled with scalar FLOATs. */
| if (n & 2) | |||
| { | |||
| LD_GP4_INC(x, 1, xd0, xd1, xd2, xd3); | |||
| LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3); | |||
| FMADD2(xd0, xd2, da_r, yd0, yd2); | |||
| yd1 OP0 da_r * xd1; | |||
| yd3 OP0 da_r * xd3; | |||
| yd0 OP1 da_i * xd1; | |||
| yd2 OP1 da_i * xd3; | |||
| yd1 OP2 da_i * xd0; | |||
| yd3 OP2 da_i * xd2; | |||
| ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| LD_GP2_INC(x, 1, xd0, xd1); | |||
| LD_GP2_INC(py, 1, yd0, yd1); | |||
| yd0 += da_r * xd0; | |||
| yd1 OP0 da_r * xd1; | |||
| yd0 OP1 da_i * xd1; | |||
| yd1 OP2 da_i * xd0; | |||
| ST_GP2_INC(yd0, yd1, y, 1); | |||
| } | |||
| } | |||
| } | |||
/* ---- Case 2: y contiguous, x strided. ---- */
| else if (1 == inc_y) | |||
| { | |||
| FLOAT *y_pref; | |||
| BLASLONG pref_offset; | |||
| v4f32 x8, x9, x10, x11, x12, x13, x14; | |||
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
| y_pref = y + pref_offset + 64; | |||
/* Stride of x in FLOATs (two FLOATs per complex element). */
| inc_x2 = 2 * inc_x; | |||
| dar_vec = COPY_FLOAT_TO_VECTOR(da_r); | |||
| dai_vec = COPY_FLOAT_TO_VECTOR(da_i); | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| PREF_OFFSET(y_pref, 0); | |||
| PREF_OFFSET(y_pref, 32); | |||
| PREF_OFFSET(y_pref, 64); | |||
| PREF_OFFSET(y_pref, 96); | |||
| y_pref += 32; | |||
/*
 * Gather 16 strided x elements: 15 vector loads (8 + 7), one per element,
 * followed by two scalar loads for the 16th element inserted into lanes 2
 * and 3 of x7. Presumably the scalar loads avoid a full-vector read at the
 * final element - confirm.
 */
| LD_SP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x14); | |||
| LD_SP7_INC(x, inc_x2, x8, x9, x10, x11, x12, x13, x7); | |||
/*
 * Presumably packs the low 64 bits (one complex element) of two vectors
 * into one, giving the same layout as a contiguous load - confirm.
 */
| PCKEV_D2_SP(x1, x0, x3, x2, x0, x1); | |||
| PCKEV_D2_SP(x5, x4, x14, x6, x2, x3); | |||
| PCKEV_D2_SP(x9, x8, x11, x10, x4, x5); | |||
| x6 = (v4f32) __msa_pckev_d((v2i64) x13, (v2i64) x12); | |||
/* The FLOAT storage is read through an int pointer to feed insert_w. */
| x7 = (v4f32) __msa_insert_w((v4i32) x7, 2, *((int *) x)); | |||
| x7 = (v4f32) __msa_insert_w((v4i32) x7, 3, *((int *) (x + 1))); | |||
| x += inc_x2; | |||
| LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7); | |||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); | |||
| PCKEVOD_W2_SP(y1, y0, y0r, y0i); | |||
| PCKEVOD_W2_SP(x3, x2, x1r, x1i); | |||
| PCKEVOD_W2_SP(y3, y2, y1r, y1i); | |||
| PCKEVOD_W2_SP(x5, x4, x2r, x2i); | |||
| PCKEVOD_W2_SP(y5, y4, y2r, y2i); | |||
| PCKEVOD_W2_SP(x7, x6, x3r, x3i); | |||
| PCKEVOD_W2_SP(y7, y6, y3r, y3i); | |||
| FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); | |||
| y0i OP0 dar_vec * x0i; | |||
| y1i OP0 dar_vec * x1i; | |||
| y2i OP0 dar_vec * x2i; | |||
| y3i OP0 dar_vec * x3i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y1r OP1 dai_vec * x1i; | |||
| y2r OP1 dai_vec * x2i; | |||
| y3r OP1 dai_vec * x3i; | |||
| y0i OP2 dai_vec * x0r; | |||
| y1i OP2 dai_vec * x1r; | |||
| y2i OP2 dai_vec * x2r; | |||
| y3i OP2 dai_vec * x3r; | |||
| ILVRL_W2_SP(y0i, y0r, y0, y1); | |||
| ILVRL_W2_SP(y1i, y1r, y2, y3); | |||
| ILVRL_W2_SP(y2i, y2r, y4, y5); | |||
| ILVRL_W2_SP(y3i, y3r, y6, y7); | |||
| ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4); | |||
| } | |||
/* Tail, same bit-by-bit scheme as the contiguous case. */
| if (n & 15) | |||
| { | |||
| if (n & 8) | |||
| { | |||
/* 7 vector loads + 2 scalar loads = 8 strided elements. */
| LD_SP7_INC(x, inc_x2, x0, x1, x2, x6, x4, x5, x3); | |||
| PCKEV_D2_SP(x1, x0, x6, x2, x0, x1); | |||
| x2 = (v4f32) __msa_pckev_d((v2i64) x5, (v2i64) x4); | |||
| x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x)); | |||
| x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) (x + 1))); | |||
| x += inc_x2; | |||
| LD_SP4_INC(py, 4, y0, y1, y2, y3); | |||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); | |||
| PCKEVOD_W2_SP(y1, y0, y0r, y0i); | |||
| PCKEVOD_W2_SP(x3, x2, x1r, x1i); | |||
| PCKEVOD_W2_SP(y3, y2, y1r, y1i); | |||
| FMADD2(x0r, x1r, dar_vec, y0r, y1r); | |||
| y0i OP0 dar_vec * x0i; | |||
| y1i OP0 dar_vec * x1i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y1r OP1 dai_vec * x1i; | |||
| y0i OP2 dai_vec * x0r; | |||
| y1i OP2 dai_vec * x1r; | |||
| ILVRL_W2_SP(y0i, y0r, y0, y1); | |||
| ILVRL_W2_SP(y1i, y1r, y2, y3); | |||
| ST_SP4_INC(y0, y1, y2, y3, y, 4); | |||
| } | |||
| if (n & 4) | |||
| { | |||
/* 3 vector loads + 2 scalar loads = 4 strided elements. */
| LD_SP3_INC(x, inc_x2, x0, x2, x1); | |||
| x0 = (v4f32) __msa_pckev_d((v2i64) x2, (v2i64) x0); | |||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x)); | |||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) (x + 1))); | |||
| x += inc_x2; | |||
| LD_SP2_INC(py, 4, y0, y1); | |||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); | |||
| PCKEVOD_W2_SP(y1, y0, y0r, y0i); | |||
| y0r += dar_vec * x0r; | |||
| y0i OP0 dar_vec * x0i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y0i OP2 dai_vec * x0r; | |||
| ILVRL_W2_SP(y0i, y0r, y0, y1); | |||
| ST_SP2_INC(y0, y1, y, 4); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| xd0 = x[0]; | |||
| xd1 = x[1]; | |||
| x += inc_x2; | |||
| xd2 = x[0]; | |||
| xd3 = x[1]; | |||
| x += inc_x2; | |||
| LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3); | |||
| FMADD2(xd0, xd2, da_r, yd0, yd2); | |||
| yd1 OP0 da_r * xd1; | |||
| yd3 OP0 da_r * xd3; | |||
| yd0 OP1 da_i * xd1; | |||
| yd2 OP1 da_i * xd3; | |||
| yd1 OP2 da_i * xd0; | |||
| yd3 OP2 da_i * xd2; | |||
| ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1); | |||
| } | |||
| if (n & 1) | |||
| { | |||
/* Final element: real and imag are adjacent, so a unit step is used. */
| LD_GP2_INC(x, 1, xd0, xd1); | |||
| LD_GP2_INC(py, 1, yd0, yd1); | |||
| yd0 += da_r * xd0; | |||
| yd1 OP0 da_r * xd1; | |||
| yd0 OP1 da_i * xd1; | |||
| yd1 OP2 da_i * xd0; | |||
| ST_GP2_INC(yd0, yd1, y, 1); | |||
| } | |||
| } | |||
| } | |||
/* ---- Case 3: y strided (x may be strided or not): scalar code only. ---- */
| else | |||
| { | |||
| inc_x2 = 2 * inc_x; | |||
| inc_y2 = 2 * inc_y; | |||
/* Four complex elements per iteration. */
| for (i = (n >> 2); i--;) | |||
| { | |||
| xd0 = x[0]; | |||
| xd1 = x[1]; | |||
| x += inc_x2; | |||
| xd2 = x[0]; | |||
| xd3 = x[1]; | |||
| x += inc_x2; | |||
| xd4 = x[0]; | |||
| xd5 = x[1]; | |||
| x += inc_x2; | |||
| xd6 = x[0]; | |||
| xd7 = x[1]; | |||
| x += inc_x2; | |||
| yd0 = py[0]; | |||
| yd1 = py[1]; | |||
| py += inc_y2; | |||
| yd2 = py[0]; | |||
| yd3 = py[1]; | |||
| py += inc_y2; | |||
| yd4 = py[0]; | |||
| yd5 = py[1]; | |||
| py += inc_y2; | |||
| yd6 = py[0]; | |||
| yd7 = py[1]; | |||
| py += inc_y2; | |||
| FMADD4(xd0, xd2, xd4, xd6, da_r, yd0, yd2, yd4, yd6); | |||
| yd1 OP0 da_r * xd1; | |||
| yd3 OP0 da_r * xd3; | |||
| yd5 OP0 da_r * xd5; | |||
| yd7 OP0 da_r * xd7; | |||
| yd0 OP1 da_i * xd1; | |||
| yd2 OP1 da_i * xd3; | |||
| yd4 OP1 da_i * xd5; | |||
| yd6 OP1 da_i * xd7; | |||
| yd1 OP2 da_i * xd0; | |||
| yd3 OP2 da_i * xd2; | |||
| yd5 OP2 da_i * xd4; | |||
| yd7 OP2 da_i * xd6; | |||
| y[0] = yd0; | |||
| y[1] = yd1; | |||
| y += inc_y2; | |||
| y[0] = yd2; | |||
| y[1] = yd3; | |||
| y += inc_y2; | |||
| y[0] = yd4; | |||
| y[1] = yd5; | |||
| y += inc_y2; | |||
| y[0] = yd6; | |||
| y[1] = yd7; | |||
| y += inc_y2; | |||
| } | |||
| if (n & 3) | |||
| { | |||
| if (n & 2) | |||
| { | |||
| xd0 = x[0]; | |||
| xd1 = x[1]; | |||
| x += inc_x2; | |||
| xd2 = x[0]; | |||
| xd3 = x[1]; | |||
| x += inc_x2; | |||
| yd0 = py[0]; | |||
| yd1 = py[1]; | |||
| py += inc_y2; | |||
| yd2 = py[0]; | |||
| yd3 = py[1]; | |||
| py += inc_y2; | |||
| FMADD2(xd0, xd2, da_r, yd0, yd2); | |||
| yd1 OP0 da_r * xd1; | |||
| yd3 OP0 da_r * xd3; | |||
| yd0 OP1 da_i * xd1; | |||
| yd2 OP1 da_i * xd3; | |||
| yd1 OP2 da_i * xd0; | |||
| yd3 OP2 da_i * xd2; | |||
| y[0] = yd0; | |||
| y[1] = yd1; | |||
| y += inc_y2; | |||
| y[0] = yd2; | |||
| y[1] = yd3; | |||
| y += inc_y2; | |||
| } | |||
| if (n & 1) | |||
| { | |||
/* Reads through y here; py and y have advanced in lockstep in this branch. */
| xd0 = x[0]; | |||
| xd1 = x[1]; | |||
| yd0 = y[0]; | |||
| yd1 = y[1]; | |||
| yd0 += da_r * xd0; | |||
| yd1 OP0 da_r * xd1; | |||
| yd0 OP1 da_i * xd1; | |||
| yd1 OP2 da_i * xd0; | |||
| y[0] = yd0; | |||
| y[1] = yd1; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,201 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
/*
 * Complex COPY kernel using MSA vectors: copies n complex elements from x
 * to y. Elements are interleaved (real, imag) FLOAT pairs; inc_x and inc_y
 * are strides counted in complex elements (pointers advance by 2 * inc
 * FLOATs). Returns 0 in every case; nothing is copied when n < 0.
 *
 * NOTE(review): vectors are v4f32, so FLOAT is presumably float here, and
 * LD_SP / ST_SP presumably move one 4-float vector each - confirm against
 * macros_msa.h and the build flags.
 */
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i, inc_x2, inc_y2; | |||
| v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; | |||
| FLOAT f0, f1, f2, f3, f4, f5, f6, f7; | |||
| if (n < 0) return (0); | |||
/* ---- Contiguous case. ---- */
| if ((1 == inc_x) && (1 == inc_y)) | |||
| { | |||
/* Bulk part: blocks of 32 complex elements (16 vectors) each. */
| if (n > 31) | |||
| { | |||
| FLOAT *x_pref; | |||
| BLASLONG pref_offset; | |||
/*
 * Prefetch cursor: distance from x up to the next L1_DATA_LINESIZE
 * boundary, converted from bytes to FLOATs, plus a 160-FLOAT lead.
 */
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
| x_pref = x + pref_offset + 128 + 32; | |||
/* Prime the pipeline: first half-block is loaded before the loop. */
| LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||
/*
 * One iteration per block except the last: (n >> 5) - 1 iterations. Each
 * load of a new vector is paired with the store of one loaded earlier, so
 * the statement order here is significant.
 */
| for (i = (n >> 5) - 1; i--;) | |||
| { | |||
| PREF_OFFSET(x_pref, 0); | |||
| PREF_OFFSET(x_pref, 32); | |||
| PREF_OFFSET(x_pref, 64); | |||
| PREF_OFFSET(x_pref, 96); | |||
| PREF_OFFSET(x_pref, 128); | |||
| PREF_OFFSET(x_pref, 160); | |||
| PREF_OFFSET(x_pref, 192); | |||
| PREF_OFFSET(x_pref, 224); | |||
| x_pref += 64; | |||
| x8 = LD_SP(x); x += 4; | |||
| ST_SP(x0, y); y += 4; | |||
| x9 = LD_SP(x); x += 4; | |||
| ST_SP(x1, y); y += 4; | |||
| x10 = LD_SP(x); x += 4; | |||
| ST_SP(x2, y); y += 4; | |||
| x11 = LD_SP(x); x += 4; | |||
| ST_SP(x3, y); y += 4; | |||
| x12 = LD_SP(x); x += 4; | |||
| ST_SP(x4, y); y += 4; | |||
| x13 = LD_SP(x); x += 4; | |||
| ST_SP(x5, y); y += 4; | |||
| x14 = LD_SP(x); x += 4; | |||
| ST_SP(x6, y); y += 4; | |||
| x15 = LD_SP(x); x += 4; | |||
| ST_SP(x7, y); y += 4; | |||
| x0 = LD_SP(x); x += 4; | |||
| ST_SP(x8, y); y += 4; | |||
| x1 = LD_SP(x); x += 4; | |||
| ST_SP(x9, y); y += 4; | |||
| x2 = LD_SP(x); x += 4; | |||
| ST_SP(x10, y); y += 4; | |||
| x3 = LD_SP(x); x += 4; | |||
| ST_SP(x11, y); y += 4; | |||
| x4 = LD_SP(x); x += 4; | |||
| ST_SP(x12, y); y += 4; | |||
| x5 = LD_SP(x); x += 4; | |||
| ST_SP(x13, y); y += 4; | |||
| x6 = LD_SP(x); x += 4; | |||
| ST_SP(x14, y); y += 4; | |||
| x7 = LD_SP(x); x += 4; | |||
| ST_SP(x15, y); y += 4; | |||
| } | |||
/* Drain: load the second half of the last block and store all 16 vectors. */
| x8 = LD_SP(x); x += 4; | |||
| x9 = LD_SP(x); x += 4; | |||
| ST_SP(x0, y); y += 4; | |||
| x10 = LD_SP(x); x += 4; | |||
| ST_SP(x1, y); y += 4; | |||
| x11 = LD_SP(x); x += 4; | |||
| ST_SP(x2, y); y += 4; | |||
| x12 = LD_SP(x); x += 4; | |||
| ST_SP(x3, y); y += 4; | |||
| x13 = LD_SP(x); x += 4; | |||
| ST_SP(x4, y); y += 4; | |||
| x14 = LD_SP(x); x += 4; | |||
| ST_SP(x5, y); y += 4; | |||
| x15 = LD_SP(x); x += 4; | |||
| ST_SP(x6, y); y += 4; | |||
| ST_SP(x7, y); y += 4; | |||
| ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4); | |||
| } | |||
/* Tail: the low five bits of n select blocks of 16, 8, 4, 2 and 1 elements. */
| if (n & 31) | |||
| { | |||
| if (n & 16) | |||
| { | |||
| LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4); | |||
| } | |||
| if (n & 8) | |||
| { | |||
| LD_SP4_INC(x, 4, x0, x1, x2, x3); | |||
| ST_SP4_INC(x0, x1, x2, x3, y, 4); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_SP2_INC(x, 4, x0, x1); | |||
| ST_SP2_INC(x0, x1, y, 4); | |||
| } | |||
/* Last 2 and 1 elements are moved as scalar FLOATs. */
| if (n & 2) | |||
| { | |||
| LD_GP4_INC(x, 1, f0, f1, f2, f3); | |||
| ST_GP4_INC(f0, f1, f2, f3, y, 1); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| LD_GP2_INC(x, 1, f0, f1); | |||
| ST_GP2_INC(f0, f1, y, 1); | |||
| } | |||
| } | |||
| } | |||
/* ---- Strided case: scalar copies, four complex elements per iteration. ---- */
| else | |||
| { | |||
| inc_x2 = 2 * inc_x; | |||
| inc_y2 = 2 * inc_y; | |||
| for (i = (n >> 2); i--;) | |||
| { | |||
| f0 = *x; | |||
| f1 = *(x+1); x += inc_x2; | |||
| f2 = *x; | |||
| f3 = *(x+1); x += inc_x2; | |||
| f4 = *x; | |||
| f5 = *(x+1); x += inc_x2; | |||
| f6 = *x; | |||
| f7 = *(x+1); x += inc_x2; | |||
| *y = f0; | |||
| *(y+1) = f1; y += inc_y2; | |||
| *y = f2; | |||
| *(y+1) = f3; y += inc_y2; | |||
| *y = f4; | |||
| *(y+1) = f5; y += inc_y2; | |||
| *y = f6; | |||
| *(y+1) = f7; y += inc_y2; | |||
| } | |||
| if (n & 2) | |||
| { | |||
| f0 = *x; | |||
| f1 = *(x+1); x += inc_x2; | |||
| f2 = *x; | |||
| f3 = *(x+1); x += inc_x2; | |||
| *y = f0; | |||
| *(y+1) = f1; y += inc_y2; | |||
| *y = f2; | |||
| *(y+1) = f3; y += inc_y2; | |||
| } | |||
| if (n & 1) | |||
| { | |||
/* Final element: real and imag are adjacent, so a unit step is used. */
| LD_GP2_INC(x, 1, f0, f1); | |||
| ST_GP2_INC(f0, f1, y, 1); | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,281 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
/*
 * Complex SWAP kernel using MSA vectors: exchanges n complex elements
 * between srcx and srcy. Elements are interleaved (real, imag) FLOAT pairs;
 * inc_x and inc_y are strides counted in complex elements. dummy0, dummy1,
 * dummy3, dummy4, dummy and dummy2 are never read. Returns 0 in every case;
 * nothing is exchanged when n < 0.
 *
 * NOTE(review): vectors are v4f32, so FLOAT is presumably float here, and
 * LD_SP / ST_SP presumably move one 4-float vector each - confirm against
 * macros_msa.h and the build flags.
 */
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, | |||
| FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, | |||
| BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i = 0, pref_offsetx, pref_offsety; | |||
/* In the contiguous path px/py are read cursors, srcx/srcy write cursors. */
| FLOAT *px, *py; | |||
| BLASLONG inc_x2, inc_y2; | |||
| FLOAT x0, x1, x2, x3, x4, x5, x6, x7; | |||
| FLOAT y0, y1, y2, y3, y4, y5, y6, y7; | |||
| v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7; | |||
| v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7; | |||
| if (n < 0) return (0); | |||
/*
 * Prefetch offsets: distance from each pointer up to the next
 * L1_DATA_LINESIZE boundary, converted from bytes to FLOATs. Only the
 * contiguous path below reads them.
 */
| pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offsetx > 0) | |||
| { | |||
| pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; | |||
| pref_offsetx = pref_offsetx / sizeof(FLOAT); | |||
| } | |||
| pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offsety > 0) | |||
| { | |||
| pref_offsety = L1_DATA_LINESIZE - pref_offsety; | |||
| pref_offsety = pref_offsety / sizeof(FLOAT); | |||
| } | |||
| px = srcx; | |||
| py = srcy; | |||
/* Strides in FLOATs (two FLOATs per complex element). */
| inc_x2 = 2 * inc_x; | |||
| inc_y2 = 2 * inc_y; | |||
/* ---- Contiguous case. ---- */
| if ((1 == inc_x) && (1 == inc_y)) | |||
| { | |||
/* Bulk part: blocks of 16 complex elements (8 vectors per side). */
| if (n >> 4) | |||
| { | |||
/* Prime the pipeline: the first x block is loaded before the loop. */
| LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7); | |||
/*
 * (n >> 4) - 1 iterations. Each pass reads the current y block while
 * storing the held x block into y, then reads the next x block while
 * storing the y block into x. The read cursors stay ahead of the write
 * cursors, so the statement order here is significant.
 */
| for (i = (n >> 4) - 1; i--;) | |||
| { | |||
| PREFETCH(px + pref_offsetx + 32); | |||
| PREFETCH(px + pref_offsetx + 40); | |||
| PREFETCH(px + pref_offsetx + 48); | |||
| PREFETCH(px + pref_offsetx + 56); | |||
| PREFETCH(py + pref_offsety + 32); | |||
| PREFETCH(py + pref_offsety + 40); | |||
| PREFETCH(py + pref_offsety + 48); | |||
| PREFETCH(py + pref_offsety + 56); | |||
| yv0 = LD_SP(py); py += 4; | |||
| ST_SP(xv0, srcy); srcy += 4; | |||
| yv1 = LD_SP(py); py += 4; | |||
| ST_SP(xv1, srcy); srcy += 4; | |||
| yv2 = LD_SP(py); py += 4; | |||
| ST_SP(xv2, srcy); srcy += 4; | |||
| yv3 = LD_SP(py); py += 4; | |||
| ST_SP(xv3, srcy); srcy += 4; | |||
| yv4 = LD_SP(py); py += 4; | |||
| ST_SP(xv4, srcy); srcy += 4; | |||
| yv5 = LD_SP(py); py += 4; | |||
| ST_SP(xv5, srcy); srcy += 4; | |||
| yv6 = LD_SP(py); py += 4; | |||
| ST_SP(xv6, srcy); srcy += 4; | |||
| yv7 = LD_SP(py); py += 4; | |||
| ST_SP(xv7, srcy); srcy += 4; | |||
| xv0 = LD_SP(px); px += 4; | |||
| ST_SP(yv0, srcx); srcx += 4; | |||
| xv1 = LD_SP(px); px += 4; | |||
| ST_SP(yv1, srcx); srcx += 4; | |||
| xv2 = LD_SP(px); px += 4; | |||
| ST_SP(yv2, srcx); srcx += 4; | |||
| xv3 = LD_SP(px); px += 4; | |||
| ST_SP(yv3, srcx); srcx += 4; | |||
| xv4 = LD_SP(px); px += 4; | |||
| ST_SP(yv4, srcx); srcx += 4; | |||
| xv5 = LD_SP(px); px += 4; | |||
| ST_SP(yv5, srcx); srcx += 4; | |||
| xv6 = LD_SP(px); px += 4; | |||
| ST_SP(yv6, srcx); srcx += 4; | |||
| xv7 = LD_SP(px); px += 4; | |||
| ST_SP(yv7, srcx); srcx += 4; | |||
| } | |||
/* Drain: exchange the last block still held in xv0..xv7. */
| LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7); | |||
| ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4); | |||
| ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4); | |||
| } | |||
/*
 * Tail: bits 8/4/2 of n are handled together by a single branch that moves
 * 7, 6, 5, 3, 4, 2 or 1 vectors (2 complex elements per vector); bit 1 is
 * handled separately with scalars afterwards.
 */
| if (n & 15) | |||
| { | |||
| if ((n & 8) && (n & 4) && (n & 2)) | |||
| { | |||
| LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6); | |||
| LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6); | |||
| ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4); | |||
| ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4); | |||
| } | |||
| else if ((n & 8) && (n & 4)) | |||
| { | |||
| LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5); | |||
| LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5); | |||
| ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4); | |||
| ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4); | |||
| } | |||
| else if ((n & 8) && (n & 2)) | |||
| { | |||
| LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4); | |||
| LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4); | |||
| ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4); | |||
| ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4); | |||
| } | |||
| else if ((n & 4) && (n & 2)) | |||
| { | |||
| LD_SP3_INC(px, 4, xv0, xv1, xv2); | |||
| LD_SP3_INC(py, 4, yv0, yv1, yv2); | |||
| ST_SP3_INC(xv0, xv1, xv2, srcy, 4); | |||
| ST_SP3_INC(yv0, yv1, yv2, srcx, 4); | |||
| } | |||
| else if (n & 8) | |||
| { | |||
| LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3); | |||
| LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3); | |||
| ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4); | |||
| ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4); | |||
| } | |||
| else if (n & 4) | |||
| { | |||
| LD_SP2_INC(px, 4, xv0, xv1); | |||
| LD_SP2_INC(py, 4, yv0, yv1); | |||
| ST_SP2_INC(xv0, xv1, srcy, 4); | |||
| ST_SP2_INC(yv0, yv1, srcx, 4); | |||
| } | |||
| else if (n & 2) | |||
| { | |||
| xv0 = LD_SP(px); | |||
| yv0 = LD_SP(py); | |||
| px += 4; | |||
| py += 4; | |||
| ST_SP(xv0, srcy); | |||
| ST_SP(yv0, srcx); | |||
| srcx += 4; | |||
| srcy += 4; | |||
| } | |||
/* Odd element: swapped as two scalar FLOATs per side. */
| if (n & 1) | |||
| { | |||
| LD_GP2_INC(px, 1, x0, x1); | |||
| LD_GP2_INC(py, 1, y0, y1); | |||
| ST_GP2_INC(x0, x1, srcy, 1); | |||
| ST_GP2_INC(y0, y1, srcx, 1); | |||
| } | |||
| } | |||
| } | |||
/*
 * ---- Strided case: scalar swaps through srcx/srcy directly (px/py are not
 * used), four complex elements per iteration. ----
 */
| else | |||
| { | |||
| for (i = (n >> 2); i--;) | |||
| { | |||
| x0 = srcx[0 * inc_x2]; | |||
| x1 = srcx[0 * inc_x2 + 1]; | |||
| x2 = srcx[1 * inc_x2]; | |||
| x3 = srcx[1 * inc_x2 + 1]; | |||
| x4 = srcx[2 * inc_x2]; | |||
| x5 = srcx[2 * inc_x2 + 1]; | |||
| x6 = srcx[3 * inc_x2]; | |||
| x7 = srcx[3 * inc_x2 + 1]; | |||
| y0 = srcy[0 * inc_y2]; | |||
| y1 = srcy[0 * inc_y2 + 1]; | |||
| y2 = srcy[1 * inc_y2]; | |||
| y3 = srcy[1 * inc_y2 + 1]; | |||
| y4 = srcy[2 * inc_y2]; | |||
| y5 = srcy[2 * inc_y2 + 1]; | |||
| y6 = srcy[3 * inc_y2]; | |||
| y7 = srcy[3 * inc_y2 + 1]; | |||
| srcx[0 * inc_x2] = y0; | |||
| srcx[0 * inc_x2 + 1] = y1; | |||
| srcx[1 * inc_x2] = y2; | |||
| srcx[1 * inc_x2 + 1] = y3; | |||
| srcx[2 * inc_x2] = y4; | |||
| srcx[2 * inc_x2 + 1] = y5; | |||
| srcx[3 * inc_x2] = y6; | |||
| srcx[3 * inc_x2 + 1] = y7; | |||
| srcy[0 * inc_y2] = x0; | |||
| srcy[0 * inc_y2 + 1] = x1; | |||
| srcy[1 * inc_y2] = x2; | |||
| srcy[1 * inc_y2 + 1] = x3; | |||
| srcy[2 * inc_y2] = x4; | |||
| srcy[2 * inc_y2 + 1] = x5; | |||
| srcy[3 * inc_y2] = x6; | |||
| srcy[3 * inc_y2 + 1] = x7; | |||
| srcx += 4 * inc_x2; | |||
| srcy += 4 * inc_y2; | |||
| } | |||
| if (n & 2) | |||
| { | |||
| x0 = srcx[0 * inc_x2]; | |||
| x1 = srcx[0 * inc_x2 + 1]; | |||
| x2 = srcx[1 * inc_x2]; | |||
| x3 = srcx[1 * inc_x2 + 1]; | |||
| y0 = srcy[0 * inc_y2]; | |||
| y1 = srcy[0 * inc_y2 + 1]; | |||
| y2 = srcy[1 * inc_y2]; | |||
| y3 = srcy[1 * inc_y2 + 1]; | |||
| srcx[0 * inc_x2] = y0; | |||
| srcx[0 * inc_x2 + 1] = y1; | |||
| srcx[1 * inc_x2] = y2; | |||
| srcx[1 * inc_x2 + 1] = y3; | |||
| srcy[0 * inc_y2] = x0; | |||
| srcy[0 * inc_y2 + 1] = x1; | |||
| srcy[1 * inc_y2] = x2; | |||
| srcy[1 * inc_y2 + 1] = x3; | |||
| srcx += 2 * inc_x2; | |||
| srcy += 2 * inc_y2; | |||
| } | |||
| if (n & 1) | |||
| { | |||
| x0 = srcx[0 * inc_x2]; | |||
| x1 = srcx[0 * inc_x2 + 1]; | |||
| y0 = srcy[0 * inc_y2]; | |||
| y1 = srcy[0 * inc_y2 + 1]; | |||
| srcx[0 * inc_x2] = y0; | |||
| srcx[0 * inc_x2 + 1] = y1; | |||
| srcy[0 * inc_y2] = x0; | |||
| srcy[0 * inc_y2 + 1] = x1; | |||
| srcx += inc_x2; | |||
| srcy += inc_y2; | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,246 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| #if !defined(CONJ) /* OP0, OP1 and OP2 expand to += or -= depending on whether CONJ is defined; the CNAME function that follows does not reference any of them. */ | |||
| #define OP0 += | |||
| #define OP1 -= | |||
| #define OP2 += | |||
| #else | |||
| #define OP0 -= | |||
| #define OP1 += | |||
| #define OP2 -= | |||
| #endif | |||
| /* AXPY-type kernel: performs y[k] += da * x[k] for k = 0 .. n-1, stepping x by | |||
|    inc_x and y by inc_y elements. Returns at once when n < 0 or da == 0.0. | |||
|    dummy0, dummy1, dummy and dummy2 are never read. Three paths are taken: | |||
|    both strides 1 (vector loads on x and y), only inc_y == 1 (x is gathered | |||
|    into vectors), and a scalar path for every other stride combination. | |||
|    The vector paths hold two elements per v2f64, so FLOAT is presumably | |||
|    double here (TODO confirm against the build). Always returns 0. */ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
| BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||
| BLASLONG dummy2) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *py; | |||
| v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; | |||
| v2f64 da_vec, zero_v = {0}; | |||
| if ((n < 0) || (da == 0.0)) return(0); /* nothing to add */ | |||
| py = y; /* py is the read cursor for y; y itself serves as the write cursor */ | |||
| if ((1 == inc_x) && (1 == inc_y)) /* path 1: x and y both contiguous */ | |||
| { | |||
| FLOAT *x_pref, *y_pref; | |||
| BLASLONG pref_offset; | |||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); /* byte offset of x inside its L1 line */ | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; /* bytes up to the next line boundary */ | |||
| pref_offset = pref_offset / sizeof(FLOAT); /* ... converted to elements */ | |||
| } | |||
| x_pref = x + pref_offset + 32; /* prefetch cursor: next line boundary plus 32 elements */ | |||
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); /* same computation for y */ | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
| y_pref = y + pref_offset + 32; | |||
| da_vec = COPY_DOUBLE_TO_VECTOR(da); /* presumably broadcasts da to both lanes (macro not visible here) */ | |||
| for (i = (n >> 4); i--;) /* main loop: 16 elements per iteration */ | |||
| { | |||
| PREF_OFFSET(x_pref, 0); | |||
| PREF_OFFSET(x_pref, 32); | |||
| PREF_OFFSET(x_pref, 64); | |||
| PREF_OFFSET(x_pref, 96); | |||
| PREF_OFFSET(y_pref, 0); | |||
| PREF_OFFSET(y_pref, 32); | |||
| PREF_OFFSET(y_pref, 64); | |||
| PREF_OFFSET(y_pref, 96); | |||
| x_pref += 16; /* keep both prefetch cursors in step with the 16 elements consumed */ | |||
| y_pref += 16; | |||
| LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); | |||
| FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); | |||
| FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7); | |||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); | |||
| } | |||
| if (n & 15) /* remainder: peel 8, 4, 2 and 1 elements according to the low bits of n */ | |||
| { | |||
| if (n & 8) | |||
| { | |||
| LD_DP4_INC(x, 2, x0, x1, x2, x3); | |||
| LD_DP4_INC(py, 2, y0, y1, y2, y3); | |||
| FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); | |||
| ST_DP4_INC(y0, y1, y2, y3, y, 2); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_DP2_INC(x, 2, x0, x1); | |||
| LD_DP2_INC(py, 2, y0, y1); | |||
| FMADD2(x0, x1, da_vec, y0, y1); | |||
| ST_DP2_INC(y0, y1, y, 2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| x0 = LD_DP(x); x += 2; | |||
| y0 = LD_DP(py); py += 2; | |||
| y0 += da_vec * x0; | |||
| ST_DP(y0, y); y += 2; | |||
| } | |||
| if (n & 1) | |||
| { | |||
| y[0] += da * x[0]; /* last odd element, scalar */ | |||
| } | |||
| } | |||
| } | |||
| else if (1 == inc_y) /* path 2: y contiguous, x strided */ | |||
| { | |||
| FLOAT *y_pref; | |||
| BLASLONG pref_offset; | |||
| v2f64 x8, x9, x10, x11, x12, x13, x14; | |||
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
| y_pref = y + pref_offset + 32; | |||
| da_vec = COPY_DOUBLE_TO_VECTOR(da); | |||
| for (i = (n >> 4); i--;) /* 16 elements per iteration */ | |||
| { | |||
| PREF_OFFSET(y_pref, 0); | |||
| PREF_OFFSET(y_pref, 32); | |||
| PREF_OFFSET(y_pref, 64); | |||
| PREF_OFFSET(y_pref, 96); | |||
| y_pref += 16; | |||
| LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x14); /* 8 + 7 vector loads at stride inc_x; the 16th element is inserted separately below. NOTE(review): each load is presumably two lanes wide and so touches the element after x[k*inc_x] as well - confirm this stays in bounds for every stride the callers use */ | |||
| LD_DP7_INC(x, inc_x, x8, x9, x10, x11, x12, x13, x7); | |||
| PCKEV_D2_SD(x1, x0, x3, x2, x0, x1); /* presumably pairs up lane 0 of two loads per output vector, like the explicit __msa_pckev_d call below */ | |||
| PCKEV_D2_SD(x5, x4, x14, x6, x2, x3); | |||
| PCKEV_D2_SD(x9, x8, x11, x10, x4, x5); | |||
| x6 = (v2f64) __msa_pckev_d((v2i64) x13, (v2i64) x12); | |||
| x7 = (v2f64) __msa_insert_d((v2i64) x7, 1, *((BLASLONG *) x)); /* 16th element goes into lane 1 as raw bits read through a BLASLONG pointer - assumes BLASLONG is as wide as FLOAT, TODO confirm */ | |||
| x += inc_x; | |||
| LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); | |||
| FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); | |||
| FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7); | |||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); | |||
| } | |||
| if (n & 15) /* remainder, same gather scheme on 8, 4, 2 and 1 elements */ | |||
| { | |||
| if (n & 8) | |||
| { | |||
| LD_DP7_INC(x, inc_x, x0, x1, x2, x6, x4, x5, x3); /* 7 vector loads, 8th element inserted below */ | |||
| PCKEV_D2_SD(x1, x0, x6, x2, x0, x1); | |||
| x2 = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4); | |||
| x3 = (v2f64) __msa_insert_d((v2i64) x3, 1, *((BLASLONG *) x)); | |||
| x += inc_x; | |||
| LD_DP4_INC(py, 2, y0, y1, y2, y3); | |||
| FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); | |||
| ST_DP4_INC(y0, y1, y2, y3, y, 2); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_DP3_INC(x, inc_x, x0, x2, x1); /* 3 vector loads, 4th element inserted below */ | |||
| x0 = (v2f64) __msa_pckev_d((v2i64) x2, (v2i64) x0); | |||
| x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((BLASLONG *) x)); | |||
| x += inc_x; | |||
| LD_DP2_INC(py, 2, y0, y1); | |||
| FMADD2(x0, x1, da_vec, y0, y1); | |||
| ST_DP2_INC(y0, y1, y, 2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| x0 = (v2f64) __msa_insert_d((v2i64) zero_v, 0, *((BLASLONG *) x)); /* build the pair from two scalar reads, starting from the zero vector */ | |||
| x += inc_x; | |||
| x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((BLASLONG *) x)); | |||
| x += inc_x; | |||
| y0 = LD_DP(py); py += 2; | |||
| y0 += da_vec * x0; | |||
| ST_DP(y0, y); y += 2; | |||
| } | |||
| if (n & 1) | |||
| { | |||
| y[0] += da * x[0]; | |||
| } | |||
| } | |||
| } | |||
| else /* path 3: any other stride combination, scalar arithmetic only */ | |||
| { | |||
| FLOAT x0, x1, x2, x3, y0, y1, y2, y3; /* scalars that shadow the vector variables of the outer scope */ | |||
| for (i = (n >> 2); i--;) /* 4 elements per iteration */ | |||
| { | |||
| LD_GP4_INC(x, inc_x, x0, x1, x2, x3); | |||
| LD_GP4_INC(py, inc_y, y0, y1, y2, y3); | |||
| FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3); | |||
| ST_GP4_INC(y0, y1, y2, y3, y, inc_y); | |||
| } | |||
| if (n & 3) | |||
| { | |||
| if (n & 2) | |||
| { | |||
| LD_GP2_INC(x, inc_x, x0, x1); | |||
| LD_GP2_INC(py, inc_y, y0, y1); | |||
| FMADD2(x0, x1, da, y0, y1); | |||
| ST_GP2_INC(y0, y1, y, inc_y); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| *y += da * *x; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,180 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| /* COPY-type kernel: stores x[k] into y[k] for k = 0 .. n-1, stepping x by inc_x | |||
|    and y by inc_y elements. Returns at once when n < 0. When both strides are 1 | |||
|    the data moves through v2f64 vectors (two elements each, so FLOAT is | |||
|    presumably double - TODO confirm); otherwise it moves one scalar at a time. | |||
|    Always returns 0. */ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i; | |||
| v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; | |||
| FLOAT f0, f1, f2, f3, f4, f5, f6, f7; | |||
| if (n < 0) return (0); | |||
| if ((1 == inc_x) && (1 == inc_y)) /* contiguous source and destination */ | |||
| { | |||
| if (n > 31) /* at least one full 32-element group */ | |||
| { | |||
| FLOAT *x_pref; | |||
| BLASLONG pref_offset; | |||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); /* byte offset of x inside its L1 line */ | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; /* bytes up to the next line boundary */ | |||
| pref_offset = pref_offset / sizeof(FLOAT); /* ... converted to elements */ | |||
| } | |||
| x_pref = x + pref_offset + 64 + 16; | |||
| LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); /* prologue: first 16 elements are loaded before the loop */ | |||
| for (i = (n >> 5) - 1; i--;) /* steady state: each pass stores 32 elements and loads the next 32, loads and stores interleaved */ | |||
| { | |||
| PREF_OFFSET(x_pref, 0); | |||
| PREF_OFFSET(x_pref, 32); | |||
| PREF_OFFSET(x_pref, 64); | |||
| PREF_OFFSET(x_pref, 96); | |||
| PREF_OFFSET(x_pref, 128); | |||
| PREF_OFFSET(x_pref, 160); | |||
| PREF_OFFSET(x_pref, 192); | |||
| PREF_OFFSET(x_pref, 224); | |||
| x_pref += 32; | |||
| x8 = LD_DP(x); x += 2; /* load x8..x15 while x0..x7 from the previous step are stored */ | |||
| ST_DP(x0, y); y += 2; | |||
| x9 = LD_DP(x); x += 2; | |||
| ST_DP(x1, y); y += 2; | |||
| x10 = LD_DP(x); x += 2; | |||
| ST_DP(x2, y); y += 2; | |||
| x11 = LD_DP(x); x += 2; | |||
| ST_DP(x3, y); y += 2; | |||
| x12 = LD_DP(x); x += 2; | |||
| ST_DP(x4, y); y += 2; | |||
| x13 = LD_DP(x); x += 2; | |||
| ST_DP(x5, y); y += 2; | |||
| x14 = LD_DP(x); x += 2; | |||
| ST_DP(x6, y); y += 2; | |||
| x15 = LD_DP(x); x += 2; | |||
| ST_DP(x7, y); y += 2; | |||
| x0 = LD_DP(x); x += 2; /* reload x0..x7 for the next pass while x8..x15 are stored */ | |||
| ST_DP(x8, y); y += 2; | |||
| x1 = LD_DP(x); x += 2; | |||
| ST_DP(x9, y); y += 2; | |||
| x2 = LD_DP(x); x += 2; | |||
| ST_DP(x10, y); y += 2; | |||
| x3 = LD_DP(x); x += 2; | |||
| ST_DP(x11, y); y += 2; | |||
| x4 = LD_DP(x); x += 2; | |||
| ST_DP(x12, y); y += 2; | |||
| x5 = LD_DP(x); x += 2; | |||
| ST_DP(x13, y); y += 2; | |||
| x6 = LD_DP(x); x += 2; | |||
| ST_DP(x14, y); y += 2; | |||
| x7 = LD_DP(x); x += 2; | |||
| ST_DP(x15, y); y += 2; | |||
| } | |||
| x8 = LD_DP(x); x += 2; /* epilogue: load the last 16 elements of the final group and drain all 32 pending ones */ | |||
| x9 = LD_DP(x); x += 2; | |||
| ST_DP(x0, y); y += 2; | |||
| x10 = LD_DP(x); x += 2; | |||
| ST_DP(x1, y); y += 2; | |||
| x11 = LD_DP(x); x += 2; | |||
| ST_DP(x2, y); y += 2; | |||
| x12 = LD_DP(x); x += 2; | |||
| ST_DP(x3, y); y += 2; | |||
| x13 = LD_DP(x); x += 2; | |||
| ST_DP(x4, y); y += 2; | |||
| x14 = LD_DP(x); x += 2; | |||
| ST_DP(x5, y); y += 2; | |||
| x15 = LD_DP(x); x += 2; | |||
| ST_DP(x6, y); y += 2; | |||
| ST_DP(x7, y); y += 2; | |||
| ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2); | |||
| } | |||
| if (n & 31) /* remainder: peel 16, 8, 4, 2 and 1 elements according to the low bits of n */ | |||
| { | |||
| if (n & 16) | |||
| { | |||
| LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2); | |||
| } | |||
| if (n & 8) | |||
| { | |||
| LD_DP4_INC(x, 2, x0, x1, x2, x3); | |||
| ST_DP4_INC(x0, x1, x2, x3, y, 2); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_GP4_INC(x, 1, f0, f1, f2, f3); /* from here on the tail uses scalar temporaries */ | |||
| ST_GP4_INC(f0, f1, f2, f3, y, 1); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_GP2_INC(x, 1, f0, f1); | |||
| ST_GP2_INC(f0, f1, y, 1); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| *y = *x; | |||
| } | |||
| } | |||
| } | |||
| else /* at least one stride differs from 1: scalar copy */ | |||
| { | |||
| for (i = (n >> 3); i--;) /* 8 elements per iteration */ | |||
| { | |||
| LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); | |||
| ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_GP4_INC(x, inc_x, f0, f1, f2, f3); | |||
| ST_GP4_INC(f0, f1, f2, f3, y, inc_y); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_GP2_INC(x, inc_x, f0, f1); | |||
| ST_GP2_INC(f0, f1, y, inc_y); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| *y = *x; | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,368 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2017, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| /* SCAL-type kernel: multiplies x[k] by da in place for k = 0 .. n-1, stepping x | |||
|    by inc_x elements. When da == 0.0 the elements are overwritten with zero | |||
|    without being read. y, inc_y, dummy0, dummy1, dummy and dummy2 are never | |||
|    used. The contiguous paths work on v2f64 vectors (two elements each, so | |||
|    FLOAT is presumably double - TODO confirm). Always returns 0. | |||
|    NOTE(review): unlike the sibling kernels there is no n < 0 guard here; | |||
|    presumably callers never pass a negative n - confirm. */ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
| BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||
| BLASLONG dummy2) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *px; | |||
| FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15; | |||
| v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; | |||
| v2f64 da_vec; | |||
| px = x; /* px is the read cursor; x itself serves as the write cursor */ | |||
| if (1 == inc_x) /* contiguous vector */ | |||
| { | |||
| if (0.0 == da) /* zero fill: store-only, x is never loaded */ | |||
| { | |||
| v2f64 zero_v = __msa_cast_to_vector_double(0); | |||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); /* both lanes are then explicitly set to zero */ | |||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||
| for (i = (n >> 5); i--;) /* 32 elements per iteration (two 16-element stores) */ | |||
| { | |||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||
| zero_v, zero_v, x, 2); | |||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||
| zero_v, zero_v, x, 2); | |||
| } | |||
| if (n & 31) /* remainder: 16, 8, 4, 2 and 1 elements */ | |||
| { | |||
| if (n & 16) | |||
| { | |||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||
| zero_v, zero_v, x, 2); | |||
| } | |||
| if (n & 8) | |||
| { | |||
| ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| ST_DP2_INC(zero_v, zero_v, x, 2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| *x = 0; x += 1; | |||
| *x = 0; x += 1; | |||
| } | |||
| if (n & 1) | |||
| { | |||
| *x = 0; | |||
| } | |||
| } | |||
| } | |||
| else /* contiguous, non-zero scale factor */ | |||
| { | |||
| da_vec = COPY_DOUBLE_TO_VECTOR(da); /* presumably broadcasts da to both lanes (macro not visible here) */ | |||
| if (n > 31) /* at least one full 32-element group */ | |||
| { | |||
| FLOAT *x_pref; | |||
| BLASLONG pref_offset; | |||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); /* byte offset of x inside its L1 line */ | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; /* bytes up to the next line boundary */ | |||
| pref_offset = pref_offset / sizeof(FLOAT); /* ... converted to elements */ | |||
| } | |||
| x_pref = x + pref_offset + 32 + 16; | |||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); /* prologue: first 16 elements are loaded before the loop */ | |||
| for (i = 0; i < (n >> 5) - 1; i++) /* steady state: each pass scales and stores 32 elements and loads the next 32 */ | |||
| { | |||
| PREF_OFFSET(x_pref, 0); | |||
| PREF_OFFSET(x_pref, 32); | |||
| PREF_OFFSET(x_pref, 64); | |||
| PREF_OFFSET(x_pref, 96); | |||
| PREF_OFFSET(x_pref, 128); | |||
| PREF_OFFSET(x_pref, 160); | |||
| PREF_OFFSET(x_pref, 192); | |||
| PREF_OFFSET(x_pref, 224); | |||
| x_pref += 32; | |||
| x8 = LD_DP(px); px += 2; /* load x8..x15 while x0..x7 are scaled */ | |||
| x0 *= da_vec; | |||
| x9 = LD_DP(px); px += 2; | |||
| x1 *= da_vec; | |||
| x10 = LD_DP(px); px += 2; | |||
| x2 *= da_vec; | |||
| x11 = LD_DP(px); px += 2; | |||
| x3 *= da_vec; | |||
| x12 = LD_DP(px); px += 2; | |||
| x4 *= da_vec; | |||
| x13 = LD_DP(px); px += 2; | |||
| x5 *= da_vec; | |||
| x14 = LD_DP(px); px += 2; | |||
| x6 *= da_vec; | |||
| x15 = LD_DP(px); px += 2; | |||
| x7 *= da_vec; | |||
| x8 *= da_vec; /* scale x8..x15 while x0..x7 are stored */ | |||
| ST_DP(x0, x); x += 2; | |||
| x9 *= da_vec; | |||
| ST_DP(x1, x); x += 2; | |||
| x10 *= da_vec; | |||
| ST_DP(x2, x); x += 2; | |||
| x11 *= da_vec; | |||
| ST_DP(x3, x); x += 2; | |||
| x12 *= da_vec; | |||
| ST_DP(x4, x); x += 2; | |||
| x13 *= da_vec; | |||
| ST_DP(x5, x); x += 2; | |||
| x14 *= da_vec; | |||
| ST_DP(x6, x); x += 2; | |||
| x15 *= da_vec; | |||
| ST_DP(x7, x); x += 2; | |||
| ST_DP(x8, x); x += 2; /* store x8..x15 while x0..x7 are reloaded for the next pass */ | |||
| x0 = LD_DP(px); px += 2; | |||
| ST_DP(x9, x); x += 2; | |||
| x1 = LD_DP(px); px += 2; | |||
| ST_DP(x10, x); x += 2; | |||
| x2 = LD_DP(px); px += 2; | |||
| ST_DP(x11, x); x += 2; | |||
| x3 = LD_DP(px); px += 2; | |||
| ST_DP(x12, x); x += 2; | |||
| x4 = LD_DP(px); px += 2; | |||
| ST_DP(x13, x); x += 2; | |||
| x5 = LD_DP(px); px += 2; | |||
| ST_DP(x14, x); x += 2; | |||
| x6 = LD_DP(px); px += 2; | |||
| ST_DP(x15, x); x += 2; | |||
| x7 = LD_DP(px); px += 2; | |||
| } | |||
| x8 = LD_DP(px); px += 2; /* epilogue: finish the final 32-element group without loading past it */ | |||
| x0 *= da_vec; | |||
| x9 = LD_DP(px); px += 2; | |||
| x1 *= da_vec; | |||
| x10 = LD_DP(px); px += 2; | |||
| x2 *= da_vec; | |||
| x11 = LD_DP(px); px += 2; | |||
| x3 *= da_vec; | |||
| x12 = LD_DP(px); px += 2; | |||
| x4 *= da_vec; | |||
| x13 = LD_DP(px); px += 2; | |||
| x5 *= da_vec; | |||
| x14 = LD_DP(px); px += 2; | |||
| x6 *= da_vec; | |||
| x15 = LD_DP(px); px += 2; | |||
| x7 *= da_vec; | |||
| x8 *= da_vec; | |||
| ST_DP(x0, x); x += 2; | |||
| x9 *= da_vec; | |||
| ST_DP(x1, x); x += 2; | |||
| x10 *= da_vec; | |||
| ST_DP(x2, x); x += 2; | |||
| x11 *= da_vec; | |||
| ST_DP(x3, x); x += 2; | |||
| x12 *= da_vec; | |||
| ST_DP(x4, x); x += 2; | |||
| x13 *= da_vec; | |||
| ST_DP(x5, x); x += 2; | |||
| x15 *= da_vec; /* x15 is scaled before x14 here, unlike in the loop; both happen before the final store, so the result is the same */ | |||
| ST_DP(x6, x); x += 2; | |||
| x14 *= da_vec; | |||
| ST_DP(x7, x); x += 2; | |||
| ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 2); | |||
| } | |||
| if (n & 31) /* remainder: 16, 8, 4, 2 and 1 elements */ | |||
| { | |||
| if (n & 16) | |||
| { | |||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3); | |||
| MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7); | |||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2); | |||
| } | |||
| if (n & 8) | |||
| { | |||
| LD_DP4_INC(px, 2, x0, x1, x2, x3); | |||
| MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3); | |||
| ST_DP4_INC(x0, x1, x2, x3, x, 2); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_DP2_INC(px, 2, x0, x1); | |||
| MUL2(x0, da_vec, x1, da_vec, x0, x1); | |||
| ST_DP2_INC(x0, x1, x, 2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_GP2_INC(px, 1, f0, f1); /* scalar from here on */ | |||
| MUL2(f0, da, f1, da, f0, f1); | |||
| ST_GP2_INC(f0, f1, x, 1); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| *x *= da; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| else /* strided vector: scalar code only */ | |||
| { | |||
| if (da == 0.0) /* zero fill, one element at a time */ | |||
| { | |||
| for (i = n; i--;) | |||
| { | |||
| *x = 0.0; | |||
| x += inc_x; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if (n > 15) /* at least one full 16-element group */ | |||
| { | |||
| LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); /* prologue: first 8 elements are loaded before the loop */ | |||
| for (i = 0; i < (n >> 4) - 1; i++) /* steady state: each pass scales and stores 16 elements and loads the next 16 */ | |||
| { | |||
| LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15); | |||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||
| f4 *= da; | |||
| f5 *= da; | |||
| *x = f0; x += inc_x; | |||
| f6 *= da; | |||
| *x = f1; x += inc_x; | |||
| f7 *= da; | |||
| *x = f2; x += inc_x; | |||
| f8 *= da; | |||
| *x = f3; x += inc_x; | |||
| f9 *= da; | |||
| *x = f4; x += inc_x; | |||
| f10 *= da; | |||
| *x = f5; x += inc_x; | |||
| f11 *= da; | |||
| *x = f6; x += inc_x; | |||
| f12 *= da; | |||
| *x = f7; x += inc_x; | |||
| f13 *= da; | |||
| *x = f8; x += inc_x; | |||
| f14 *= da; | |||
| *x = f9; x += inc_x; | |||
| f15 *= da; | |||
| *x = f10; x += inc_x; | |||
| *x = f11; x += inc_x; | |||
| f0 = *px; px += inc_x; /* reload f0..f7 for the next pass */ | |||
| *x = f12; x += inc_x; | |||
| f1 = *px; px += inc_x; | |||
| *x = f13; x += inc_x; | |||
| f2 = *px; px += inc_x; | |||
| *x = f14; x += inc_x; | |||
| f3 = *px; px += inc_x; | |||
| *x = f15; x += inc_x; | |||
| f4 = *px; px += inc_x; | |||
| f5 = *px; px += inc_x; | |||
| f6 = *px; px += inc_x; | |||
| f7 = *px; px += inc_x; | |||
| } | |||
| LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15); /* epilogue: finish the final 16-element group without loading past it */ | |||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||
| f4 *= da; | |||
| f5 *= da; | |||
| *x = f0; x += inc_x; | |||
| f6 *= da; | |||
| *x = f1; x += inc_x; | |||
| f7 *= da; | |||
| *x = f2; x += inc_x; | |||
| f8 *= da; | |||
| *x = f3; x += inc_x; | |||
| f9 *= da; | |||
| *x = f4; x += inc_x; | |||
| f10 *= da; | |||
| *x = f5; x += inc_x; | |||
| f11 *= da; | |||
| *x = f6; x += inc_x; | |||
| f12 *= da; | |||
| *x = f7; x += inc_x; | |||
| f13 *= da; | |||
| *x = f8; x += inc_x; | |||
| f14 *= da; | |||
| *x = f9; x += inc_x; | |||
| f15 *= da; | |||
| *x = f10; x += inc_x; | |||
| *x = f11; x += inc_x; | |||
| *x = f12; x += inc_x; | |||
| *x = f13; x += inc_x; | |||
| *x = f14; x += inc_x; | |||
| *x = f15; x += inc_x; | |||
| } | |||
| if (n & 15) /* remainder: 8, 4, 2 and 1 elements */ | |||
| { | |||
| if (n & 8) | |||
| { | |||
| LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); | |||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||
| MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7); | |||
| ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_GP4_INC(px, inc_x, f0, f1, f2, f3); | |||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||
| ST_GP4_INC(f0, f1, f2, f3, x, inc_x); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_GP2_INC(px, inc_x, f0, f1); | |||
| MUL2(f0, da, f1, da, f0, f1); | |||
| ST_GP2_INC(f0, f1, x, inc_x); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| *x *= da; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,253 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| /* SWAP-type kernel: exchanges srcx[k] and srcy[k] for k = 0 .. n-1, stepping | |||
|    srcx by inc_x and srcy by inc_y elements. Returns at once when n < 0. | |||
|    dummy0, dummy1, dummy3, dummy and dummy2 are never read. When both strides | |||
|    are 1 the data moves through v2f64 vectors (two elements each, so FLOAT is | |||
|    presumably double - TODO confirm); otherwise it moves as scalars. | |||
|    Always returns 0. */ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, | |||
| FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y, | |||
| FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i = 0, pref_offsetx, pref_offsety; | |||
| FLOAT *px, *py; | |||
| FLOAT x0, x1, x2, x3, x4, x5, x6, x7; | |||
| FLOAT y0, y1, y2, y3, y4, y5, y6, y7; | |||
| v2f64 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7; | |||
| v2f64 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7; | |||
| if (n < 0) return (0); | |||
| pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1); /* byte offset of srcx inside its L1 line; only the contiguous path uses the result */ | |||
| if (pref_offsetx > 0) | |||
| { | |||
| pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; /* bytes up to the next line boundary */ | |||
| pref_offsetx = pref_offsetx / sizeof(FLOAT); /* ... converted to elements */ | |||
| } | |||
| pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1); /* same computation for srcy */ | |||
| if (pref_offsety > 0) | |||
| { | |||
| pref_offsety = L1_DATA_LINESIZE - pref_offsety; | |||
| pref_offsety = pref_offsety / sizeof(FLOAT); | |||
| } | |||
| px = srcx; /* px and py are the read cursors; srcx and srcy serve as the write cursors */ | |||
| py = srcy; | |||
| if ((1 == inc_x) && (1 == inc_y)) /* both vectors contiguous */ | |||
| { | |||
| if (n >> 4) /* at least one full 16-element group */ | |||
| { | |||
| LD_DP8_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7); /* prologue: first 16 x elements are loaded before the loop */ | |||
| for (i = (n >> 4) - 1; i--;) /* steady state: 16 elements swapped per pass, next x group loaded ahead */ | |||
| { | |||
| PREFETCH(px + pref_offsetx + 16); | |||
| PREFETCH(px + pref_offsetx + 20); | |||
| PREFETCH(px + pref_offsetx + 24); | |||
| PREFETCH(px + pref_offsetx + 28); | |||
| PREFETCH(py + pref_offsety + 16); | |||
| PREFETCH(py + pref_offsety + 20); | |||
| PREFETCH(py + pref_offsety + 24); | |||
| PREFETCH(py + pref_offsety + 28); | |||
| yv0 = LD_DP(py); py += 2; /* each y vector is read before the x vector is stored over it */ | |||
| ST_DP(xv0, srcy); srcy += 2; | |||
| yv1 = LD_DP(py); py += 2; | |||
| ST_DP(xv1, srcy); srcy += 2; | |||
| yv2 = LD_DP(py); py += 2; | |||
| ST_DP(xv2, srcy); srcy += 2; | |||
| yv3 = LD_DP(py); py += 2; | |||
| ST_DP(xv3, srcy); srcy += 2; | |||
| yv4 = LD_DP(py); py += 2; | |||
| ST_DP(xv4, srcy); srcy += 2; | |||
| yv5 = LD_DP(py); py += 2; | |||
| ST_DP(xv5, srcy); srcy += 2; | |||
| yv6 = LD_DP(py); py += 2; | |||
| ST_DP(xv6, srcy); srcy += 2; | |||
| yv7 = LD_DP(py); py += 2; | |||
| ST_DP(xv7, srcy); srcy += 2; | |||
| xv0 = LD_DP(px); px += 2; /* load the next x group while the saved y vectors are stored into x */ | |||
| ST_DP(yv0, srcx); srcx += 2; | |||
| xv1 = LD_DP(px); px += 2; | |||
| ST_DP(yv1, srcx); srcx += 2; | |||
| xv2 = LD_DP(px); px += 2; | |||
| ST_DP(yv2, srcx); srcx += 2; | |||
| xv3 = LD_DP(px); px += 2; | |||
| ST_DP(yv3, srcx); srcx += 2; | |||
| xv4 = LD_DP(px); px += 2; | |||
| ST_DP(yv4, srcx); srcx += 2; | |||
| xv5 = LD_DP(px); px += 2; | |||
| ST_DP(yv5, srcx); srcx += 2; | |||
| xv6 = LD_DP(px); px += 2; | |||
| ST_DP(yv6, srcx); srcx += 2; | |||
| xv7 = LD_DP(px); px += 2; | |||
| ST_DP(yv7, srcx); srcx += 2; | |||
| } | |||
| LD_DP8_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7); /* epilogue: swap the final 16-element group */ | |||
| ST_DP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 2); | |||
| ST_DP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 2); | |||
| } | |||
| if (n & 15) /* remainder: one branch per combination of bits 8, 4 and 2, then the odd element */ | |||
| { | |||
| if ((n & 8) && (n & 4) && (n & 2)) /* 14 elements = 7 vectors */ | |||
| { | |||
| LD_DP7_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6); | |||
| LD_DP7_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6); | |||
| ST_DP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 2); | |||
| ST_DP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 2); | |||
| } | |||
| else if ((n & 8) && (n & 4)) /* 12 elements = 6 vectors */ | |||
| { | |||
| LD_DP6_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5); | |||
| LD_DP6_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5); | |||
| ST_DP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 2); | |||
| ST_DP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 2); | |||
| } | |||
| else if ((n & 8) && (n & 2)) /* 10 elements = 5 vectors */ | |||
| { | |||
| LD_DP5_INC(px, 2, xv0, xv1, xv2, xv3, xv4); | |||
| LD_DP5_INC(py, 2, yv0, yv1, yv2, yv3, yv4); | |||
| ST_DP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 2); | |||
| ST_DP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 2); | |||
| } | |||
| else if ((n & 4) && (n & 2)) /* 6 elements = 3 vectors */ | |||
| { | |||
| LD_DP3_INC(px, 2, xv0, xv1, xv2); | |||
| LD_DP3_INC(py, 2, yv0, yv1, yv2); | |||
| ST_DP3_INC(xv0, xv1, xv2, srcy, 2); | |||
| ST_DP3_INC(yv0, yv1, yv2, srcx, 2); | |||
| } | |||
| else if (n & 8) /* 8 elements = 4 vectors */ | |||
| { | |||
| LD_DP4_INC(px, 2, xv0, xv1, xv2, xv3); | |||
| LD_DP4_INC(py, 2, yv0, yv1, yv2, yv3); | |||
| ST_DP4_INC(xv0, xv1, xv2, xv3, srcy, 2); | |||
| ST_DP4_INC(yv0, yv1, yv2, yv3, srcx, 2); | |||
| } | |||
| else if (n & 4) /* 4 elements = 2 vectors */ | |||
| { | |||
| LD_DP2_INC(px, 2, xv0, xv1); | |||
| LD_DP2_INC(py, 2, yv0, yv1); | |||
| ST_DP2_INC(xv0, xv1, srcy, 2); | |||
| ST_DP2_INC(yv0, yv1, srcx, 2); | |||
| } | |||
| else if (n & 2) /* 2 elements = 1 vector */ | |||
| { | |||
| xv0 = LD_DP(px); | |||
| yv0 = LD_DP(py); | |||
| px += 2; | |||
| py += 2; | |||
| ST_DP(xv0, srcy); | |||
| ST_DP(yv0, srcx); | |||
| srcx += 2; | |||
| srcy += 2; | |||
| } | |||
| if (n & 1) /* odd element, handled after whichever branch above ran */ | |||
| { | |||
| x0 = px[0]; | |||
| y0 = py[0]; | |||
| srcx[0] = y0; | |||
| srcy[0] = x0; | |||
| } | |||
| } | |||
| } | |||
| else /* at least one stride differs from 1: scalar swap */ | |||
| { | |||
| for (i = (n >> 3); i--;) /* 8 elements per iteration; all loads are issued before the stores */ | |||
| { | |||
| LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7); | |||
| ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y); | |||
| ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x); | |||
| } | |||
| if (n & 7) /* remainder: exactly one branch runs, chosen by the low three bits of n */ | |||
| { | |||
| if ((n & 4) && (n & 2) && (n & 1)) /* 7 elements */ | |||
| { | |||
| LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6); | |||
| LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6); | |||
| ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y); | |||
| ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x); | |||
| } | |||
| else if ((n & 4) && (n & 2)) /* 6 elements */ | |||
| { | |||
| LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5); | |||
| LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5); | |||
| ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y); | |||
| ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x); | |||
| } | |||
| else if ((n & 4) && (n & 1)) /* 5 elements */ | |||
| { | |||
| LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4); | |||
| LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4); | |||
| ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y); | |||
| ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x); | |||
| } | |||
| else if ((n & 2) && (n & 1)) /* 3 elements */ | |||
| { | |||
| LD_GP3_INC(px, inc_x, x0, x1, x2); | |||
| LD_GP3_INC(py, inc_y, y0, y1, y2); | |||
| ST_GP3_INC(x0, x1, x2, srcy, inc_y); | |||
| ST_GP3_INC(y0, y1, y2, srcx, inc_x); | |||
| } | |||
| else if (n & 4) /* 4 elements */ | |||
| { | |||
| LD_GP4_INC(px, inc_x, x0, x1, x2, x3); | |||
| LD_GP4_INC(py, inc_y, y0, y1, y2, y3); | |||
| ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y); | |||
| ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x); | |||
| } | |||
| else if (n & 2) /* 2 elements */ | |||
| { | |||
| LD_GP2_INC(px, inc_x, x0, x1); | |||
| LD_GP2_INC(py, inc_y, y0, y1); | |||
| ST_GP2_INC(x0, x1, srcy, inc_y); | |||
| ST_GP2_INC(y0, y1, srcx, inc_x); | |||
| } | |||
| else if (n & 1) /* 1 element; reads and writes both go through srcx/srcy here */ | |||
| { | |||
| x0 = *srcx; | |||
| y0 = *srcy; | |||
| *srcx = y0; | |||
| *srcy = x0; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -722,6 +722,31 @@ inline static void prefetch_load_lf(unsigned char *src) | |||
| MUL2(in4, in5, in6, in7, out2, out3); \ | |||
| } | |||
/* Description : Multiply-accumulate of 2, 3 or 4 inputs by a common factor
   Arguments   : Inputs  - in0 .. in3, vec
                 In/Out  - inout0 .. inout3
   Details     : For every k, 'inK' is multiplied by 'vec' and the product is
                 added to 'inoutK' (inoutK += inK * vec). The operands may be
                 vectors or plain scalars, as long as '*' and '+=' are defined
                 for them.
   Notes       : Every parameter is parenthesized in the expansion, so an
                 expression argument such as 'a + b' is multiplied as a whole.
                 'vec' is expanded once per accumulator, so it must not have
                 side effects.
*/
#define FMADD2(in0, in1, vec, inout0, inout1)  \
{                                              \
    (inout0) += (in0) * (vec);                 \
    (inout1) += (in1) * (vec);                 \
}
#define FMADD3(in0, in1, in2, vec,             \
               inout0, inout1, inout2)         \
{                                              \
    (inout0) += (in0) * (vec);                 \
    (inout1) += (in1) * (vec);                 \
    (inout2) += (in2) * (vec);                 \
}
/* FMADD4 is two FMADD2 steps over the argument pairs (0,1) and (2,3). */
#define FMADD4(in0, in1, in2, in3, vec,                \
               inout0, inout1, inout2, inout3)         \
{                                                      \
    FMADD2(in0, in1, vec, inout0, inout1);             \
    FMADD2(in2, in3, vec, inout2, inout3);             \
}
| /* Description : Addition of 2 pairs of variables | |||
| Arguments : Inputs - in0, in1, in2, in3 | |||
| Outputs - out0, out1 | |||
| @@ -0,0 +1,265 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
/* Compound-assignment operators selected at compile time by CONJ:
 * without CONJ, OP0/OP1/OP2 are +=, -=, +=; with CONJ they are -=, +=, -=.
 * NOTE(review): none of the three is referenced by the kernel that follows
 * in this file; presumably carried over from the complex AXPY kernels -
 * confirm before removing.
 */
| #if !defined(CONJ) | |||
| #define OP0 += | |||
| #define OP1 -= | |||
| #define OP2 += | |||
| #else | |||
| #define OP0 -= | |||
| #define OP1 += | |||
| #define OP2 -= | |||
| #endif | |||
/* AXPY kernel on v4f32 vectors: y[k] += da * x[k] for n elements.
 *
 *   n      - element count; a negative n returns immediately
 *   da     - scalar multiplier; da == 0.0 returns immediately, y untouched
 *   x      - source vector, read with stride inc_x
 *   inc_x  - stride of x in elements
 *   y      - destination vector, read and written with stride inc_y
 *   inc_y  - stride of y in elements
 *   dummy0, dummy1, dummy, dummy2 - not used by this kernel
 *
 * Always returns 0.
 *
 * Three code paths:
 *   - inc_x == 1 and inc_y == 1: 32 elements per iteration (eight 4-float
 *     vectors from each array), then 16/8/4/2/1 element tails.
 *   - inc_y == 1 only: x is gathered one element at a time into vector
 *     lanes, 16 elements per iteration, then 8/4/2/1 element tails.
 *   - otherwise: scalar code, 4 elements per iteration, then 2/1 tails.
 */
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
| BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *py; | |||
| v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; | |||
| v4f32 da_vec, zero_v = {0}; | |||
| if ((n < 0) || (da == 0.0)) return(0); | |||
/* py is the read cursor for y; y itself is used as the write cursor. */
| py = y; | |||
| if ((1 == inc_x) && (1 == inc_y)) | |||
| { | |||
| FLOAT *x_pref, *y_pref; | |||
| BLASLONG pref_offset; | |||
/* pref_offset: number of elements from x up to the next multiple of
   L1_DATA_LINESIZE bytes (0 when x is already on such a boundary). */
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
/* The prefetch cursor starts 64 elements past that boundary. */
| x_pref = x + pref_offset + 64; | |||
/* Same computation for y. */
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
| y_pref = y + pref_offset + 64; | |||
| da_vec = COPY_FLOAT_TO_VECTOR(da); | |||
/* Main loop: n / 32 iterations, each consuming 32 elements of x and y. */
| for (i = (n >> 5); i--;) | |||
| { | |||
/* PREF_OFFSET is defined in macros_msa.h; presumably a cache prefetch
   hint at the given byte offset - confirm there. */
| PREF_OFFSET(x_pref, 0); | |||
| PREF_OFFSET(x_pref, 32); | |||
| PREF_OFFSET(x_pref, 64); | |||
| PREF_OFFSET(x_pref, 96); | |||
| PREF_OFFSET(y_pref, 0); | |||
| PREF_OFFSET(y_pref, 32); | |||
| PREF_OFFSET(y_pref, 64); | |||
| PREF_OFFSET(y_pref, 96); | |||
| x_pref += 32; | |||
| y_pref += 32; | |||
| LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7); | |||
| FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); | |||
| FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7); | |||
| ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4); | |||
| } | |||
/* Remainder: each set bit of (n & 31) selects one fixed-size tail. */
| if (n & 31) | |||
| { | |||
| if (n & 16) | |||
| { | |||
| LD_SP4_INC(x, 4, x0, x1, x2, x3); | |||
| LD_SP4_INC(py, 4, y0, y1, y2, y3); | |||
| FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); | |||
| ST_SP4_INC(y0, y1, y2, y3, y, 4); | |||
| } | |||
| if (n & 8) | |||
| { | |||
| LD_SP2_INC(x, 4, x0, x1); | |||
| LD_SP2_INC(py, 4, y0, y1); | |||
| FMADD2(x0, x1, da_vec, y0, y1); | |||
| ST_SP2_INC(y0, y1, y, 4); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| x0 = LD_SP(x); x += 4; | |||
| y0 = LD_SP(py); py += 4; | |||
| y0 += da_vec * x0; | |||
| ST_SP(y0, y); y += 4; | |||
| } | |||
/* From here on y is both read and written; py is no longer advanced. */
| if (n & 2) | |||
| { | |||
| FMADD2(x[0], x[1], da, y[0], y[1]); | |||
| x += 2; | |||
| y += 2; | |||
| } | |||
| if (n & 1) | |||
| { | |||
| y[0] += da * x[0]; | |||
| } | |||
| } | |||
| } | |||
| else if (1 == inc_y) | |||
| { | |||
| da_vec = COPY_FLOAT_TO_VECTOR(da); | |||
/* Strided x, contiguous y: 16 elements per iteration. Each x vector is
   assembled lane by lane, starting from zero_v for lane 0.
   NOTE(review): *((int *) x) reads the element through an int pointer so
   its bits can be inserted into a word lane; this is a type-punned access -
   confirm the build does not rely on strict-aliasing optimizations here. */
| for (i = (n >> 4); i--;) | |||
| { | |||
| x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| x += inc_x; | |||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x)); | |||
| x += inc_x; | |||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x)); | |||
| x += inc_x; | |||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x)); | |||
| x += inc_x; | |||
| x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| x += inc_x; | |||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x)); | |||
| x += inc_x; | |||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x)); | |||
| x += inc_x; | |||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x)); | |||
| x += inc_x; | |||
| x2 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| x += inc_x; | |||
| x2 = (v4f32) __msa_insert_w((v4i32) x2, 1, *((int *) x)); | |||
| x += inc_x; | |||
| x2 = (v4f32) __msa_insert_w((v4i32) x2, 2, *((int *) x)); | |||
| x += inc_x; | |||
| x2 = (v4f32) __msa_insert_w((v4i32) x2, 3, *((int *) x)); | |||
| x += inc_x; | |||
| x3 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| x += inc_x; | |||
| x3 = (v4f32) __msa_insert_w((v4i32) x3, 1, *((int *) x)); | |||
| x += inc_x; | |||
| x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x)); | |||
| x += inc_x; | |||
| x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) x)); | |||
| x += inc_x; | |||
| LD_SP4_INC(py, 4, y0, y1, y2, y3); | |||
| FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); | |||
| ST_SP4_INC(y0, y1, y2, y3, y, 4); | |||
| } | |||
/* Remainder: each set bit of (n & 15) selects one fixed-size tail. */
| if (n & 15) | |||
| { | |||
| if (n & 8) | |||
| { | |||
| x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| x += inc_x; | |||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x)); | |||
| x += inc_x; | |||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x)); | |||
| x += inc_x; | |||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x)); | |||
| x += inc_x; | |||
| x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| x += inc_x; | |||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x)); | |||
| x += inc_x; | |||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x)); | |||
| x += inc_x; | |||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x)); | |||
| x += inc_x; | |||
| LD_SP2_INC(py, 4, y0, y1); | |||
| FMADD2(x0, x1, da_vec, y0, y1); | |||
| ST_SP2_INC(y0, y1, y, 4); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||
| x += inc_x; | |||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x)); | |||
| x += inc_x; | |||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x)); | |||
| x += inc_x; | |||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x)); | |||
| x += inc_x; | |||
| y0 = LD_SP(py); py += 4; | |||
| y0 += da_vec * x0; | |||
| ST_SP(y0, y); y += 4; | |||
| } | |||
/* Scalar tails: y is read and written directly, x is indexed by inc_x. */
| if (n & 2) | |||
| { | |||
| FMADD2(x[0], x[inc_x], da, y[0], y[1]); | |||
| x += 2 * inc_x; | |||
| y += 2; | |||
| } | |||
| if (n & 1) | |||
| { | |||
| y[0] += da * x[0]; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
/* General strides: scalar path. These FLOAT locals shadow the v4f32
   variables of the same names declared at function scope. */
| FLOAT x0, x1, x2, x3, y0, y1, y2, y3; | |||
| for (i = (n >> 2); i--;) | |||
| { | |||
| LD_GP4_INC(x, inc_x, x0, x1, x2, x3); | |||
| LD_GP4_INC(py, inc_y, y0, y1, y2, y3); | |||
| FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3); | |||
| ST_GP4_INC(y0, y1, y2, y3, y, inc_y); | |||
| } | |||
| if (n & 3) | |||
| { | |||
| if (n & 2) | |||
| { | |||
| LD_GP2_INC(x, inc_x, x0, x1); | |||
| LD_GP2_INC(py, inc_y, y0, y1); | |||
| FMADD2(x0, x1, da, y0, y1); | |||
| ST_GP2_INC(y0, y1, y, inc_y); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| *y += da * *x; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,186 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
/* COPY kernel on v4f32 vectors: y[k] = x[k] for n elements.
 *
 *   n      - element count; a negative n returns immediately
 *   x      - source vector, read with stride inc_x
 *   inc_x  - stride of x in elements
 *   y      - destination vector, written with stride inc_y
 *   inc_y  - stride of y in elements
 *
 * Always returns 0.
 *
 * Two code paths:
 *   - inc_x == 1 and inc_y == 1: for n > 63, a software-pipelined loop
 *     moves 64 elements per iteration, followed by 32/16/8/4/2/1 element
 *     tails.
 *   - otherwise: scalar code, 8 elements per iteration, then 4/2/1 tails.
 */
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i; | |||
| v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; | |||
| FLOAT f0, f1, f2, f3, f4, f5, f6, f7; | |||
| if (n < 0) return (0); | |||
| if ((1 == inc_x) && (1 == inc_y)) | |||
| { | |||
| if (n > 63) | |||
| { | |||
| FLOAT *x_pref; | |||
| BLASLONG pref_offset; | |||
/* pref_offset: number of elements from x up to the next multiple of
   L1_DATA_LINESIZE bytes (0 when x is already on such a boundary). */
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
| x_pref = x + pref_offset + 128 + 32; | |||
/* Prologue: the first 32 elements are loaded before the loop starts. */
| LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||
/* Steady state, (n / 64) - 1 iterations: each iteration loads the next
   64 elements while storing the 64 loaded before them, alternating one
   load with one store. */
| for (i = (n >> 6) - 1; i--;) | |||
| { | |||
/* PREF_OFFSET is defined in macros_msa.h; presumably a cache prefetch
   hint at the given byte offset - confirm there. */
| PREF_OFFSET(x_pref, 0); | |||
| PREF_OFFSET(x_pref, 32); | |||
| PREF_OFFSET(x_pref, 64); | |||
| PREF_OFFSET(x_pref, 96); | |||
| PREF_OFFSET(x_pref, 128); | |||
| PREF_OFFSET(x_pref, 160); | |||
| PREF_OFFSET(x_pref, 192); | |||
| PREF_OFFSET(x_pref, 224); | |||
| x_pref += 64; | |||
| x8 = LD_SP(x); x += 4; | |||
| ST_SP(x0, y); y += 4; | |||
| x9 = LD_SP(x); x += 4; | |||
| ST_SP(x1, y); y += 4; | |||
| x10 = LD_SP(x); x += 4; | |||
| ST_SP(x2, y); y += 4; | |||
| x11 = LD_SP(x); x += 4; | |||
| ST_SP(x3, y); y += 4; | |||
| x12 = LD_SP(x); x += 4; | |||
| ST_SP(x4, y); y += 4; | |||
| x13 = LD_SP(x); x += 4; | |||
| ST_SP(x5, y); y += 4; | |||
| x14 = LD_SP(x); x += 4; | |||
| ST_SP(x6, y); y += 4; | |||
| x15 = LD_SP(x); x += 4; | |||
| ST_SP(x7, y); y += 4; | |||
| x0 = LD_SP(x); x += 4; | |||
| ST_SP(x8, y); y += 4; | |||
| x1 = LD_SP(x); x += 4; | |||
| ST_SP(x9, y); y += 4; | |||
| x2 = LD_SP(x); x += 4; | |||
| ST_SP(x10, y); y += 4; | |||
| x3 = LD_SP(x); x += 4; | |||
| ST_SP(x11, y); y += 4; | |||
| x4 = LD_SP(x); x += 4; | |||
| ST_SP(x12, y); y += 4; | |||
| x5 = LD_SP(x); x += 4; | |||
| ST_SP(x13, y); y += 4; | |||
| x6 = LD_SP(x); x += 4; | |||
| ST_SP(x14, y); y += 4; | |||
| x7 = LD_SP(x); x += 4; | |||
| ST_SP(x15, y); y += 4; | |||
| } | |||
/* Epilogue: load the last 32 elements of the final 64-element group and
   store all 64 that are still held in x0..x15. */
| x8 = LD_SP(x); x += 4; | |||
| x9 = LD_SP(x); x += 4; | |||
| ST_SP(x0, y); y += 4; | |||
| x10 = LD_SP(x); x += 4; | |||
| ST_SP(x1, y); y += 4; | |||
| x11 = LD_SP(x); x += 4; | |||
| ST_SP(x2, y); y += 4; | |||
| x12 = LD_SP(x); x += 4; | |||
| ST_SP(x3, y); y += 4; | |||
| x13 = LD_SP(x); x += 4; | |||
| ST_SP(x4, y); y += 4; | |||
| x14 = LD_SP(x); x += 4; | |||
| ST_SP(x5, y); y += 4; | |||
| x15 = LD_SP(x); x += 4; | |||
| ST_SP(x6, y); y += 4; | |||
| ST_SP(x7, y); y += 4; | |||
| ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4); | |||
| } | |||
/* Remainder: each set bit of (n & 63) selects one fixed-size tail. */
| if (n & 63) | |||
| { | |||
| if (n & 32) | |||
| { | |||
| LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4); | |||
| } | |||
| if (n & 16) | |||
| { | |||
| LD_SP4_INC(x, 4, x0, x1, x2, x3); | |||
| ST_SP4_INC(x0, x1, x2, x3, y, 4); | |||
| } | |||
| if (n & 8) | |||
| { | |||
| LD_SP2_INC(x, 4, x0, x1); | |||
| ST_SP2_INC(x0, x1, y, 4); | |||
| } | |||
/* Tails of 4 or fewer elements go through scalar FLOAT temporaries. */
| if (n & 4) | |||
| { | |||
| LD_GP4_INC(x, 1, f0, f1, f2, f3); | |||
| ST_GP4_INC(f0, f1, f2, f3, y, 1); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_GP2_INC(x, 1, f0, f1); | |||
| ST_GP2_INC(f0, f1, y, 1); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| *y = *x; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
/* General strides: scalar copy, 8 elements per iteration. */
| for (i = (n >> 3); i--;) | |||
| { | |||
| LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); | |||
| ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_GP4_INC(x, inc_x, f0, f1, f2, f3); | |||
| ST_GP4_INC(f0, f1, f2, f3, y, inc_y); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_GP2_INC(x, inc_x, f0, f1); | |||
| ST_GP2_INC(f0, f1, y, inc_y); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| *y = *x; | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,385 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2017, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
/* SCAL kernel on v4f32 vectors: x[k] *= da for n elements, in place.
 *
 *   n      - element count; n <= 0 returns immediately
 *   da     - scalar multiplier; when da == 0.0 the elements are overwritten
 *            with 0 without being read
 *   x      - vector to scale, accessed with stride inc_x
 *   inc_x  - stride of x in elements
 *   dummy0, dummy1, y, inc_y, dummy, dummy2 - not used by this kernel
 *
 * Always returns 0.
 *
 * Code paths:
 *   - inc_x == 1, da == 0.0: vector stores of zero, 64 elements per
 *     iteration, then 32/16/8/4/2/1 element tails.
 *   - inc_x == 1, da != 0.0: for n > 63, a software-pipelined loop scales
 *     64 elements per iteration, then 32/16/8/4/2/1 element tails.
 *   - other strides: scalar code; for n > 15 a pipelined loop handles 16
 *     elements per iteration, then 8/4/2/1 element tails.
 */
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
| BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||
| BLASLONG dummy2) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *px; | |||
| FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15; | |||
| v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; | |||
| v4f32 da_vec; | |||
/* Guard against non-positive counts. Without it a negative n would still
   satisfy the (n & 63) / (n & 15) tail tests and the "for (i = n; i--;)"
   loop below, writing outside the vector. n == 0 was already a no-op. */
| if (n <= 0) return 0; | |||
/* px is the read cursor; x itself is used as the write cursor. */
| px = x; | |||
| if (1 == inc_x) | |||
| { | |||
| if (0.0 == da) | |||
| { | |||
/* Build an all-zero vector: every word lane is set to 0 explicitly. */
| v4f32 zero_v = __msa_cast_to_vector_float(0); | |||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); | |||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); | |||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); | |||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); | |||
/* 64 elements per iteration: two groups of eight vector stores. */
| for (i = (n >> 6); i--;) | |||
| { | |||
| ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||
| zero_v, zero_v, x, 4); | |||
| ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||
| zero_v, zero_v, x, 4); | |||
| } | |||
/* Remainder: each set bit of (n & 63) selects one fixed-size tail. */
| if (n & 63) | |||
| { | |||
| if (n & 32) | |||
| { | |||
| ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||
| zero_v, zero_v, x, 4); | |||
| } | |||
| if (n & 16) | |||
| { | |||
| ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4); | |||
| } | |||
| if (n & 8) | |||
| { | |||
| ST_SP2_INC(zero_v, zero_v, x, 4); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| *x = 0; x += 1; | |||
| *x = 0; x += 1; | |||
| *x = 0; x += 1; | |||
| *x = 0; x += 1; | |||
| } | |||
| if (n & 2) | |||
| { | |||
| *x = 0; x += 1; | |||
| *x = 0; x += 1; | |||
| } | |||
| if (n & 1) | |||
| { | |||
| *x = 0; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| da_vec = COPY_FLOAT_TO_VECTOR(da); | |||
| if (n > 63) | |||
| { | |||
| FLOAT *x_pref; | |||
| BLASLONG pref_offset; | |||
/* pref_offset: number of elements from x up to the next multiple of
   L1_DATA_LINESIZE bytes (0 when x is already on such a boundary). */
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
| x_pref = x + pref_offset + 64 + 32; | |||
/* Prologue: the first 32 elements are loaded before the loop starts. */
| LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||
/* Steady state, (n / 64) - 1 iterations: loads through px run ahead of
   the stores through x, so every element is read before it is
   overwritten. Loads, multiplies and stores are interleaved. */
| for (i = 0; i < (n >> 6) - 1; i++) | |||
| { | |||
/* PREF_OFFSET is defined in macros_msa.h; presumably a cache prefetch
   hint at the given byte offset - confirm there. */
| PREF_OFFSET(x_pref, 0); | |||
| PREF_OFFSET(x_pref, 32); | |||
| PREF_OFFSET(x_pref, 64); | |||
| PREF_OFFSET(x_pref, 96); | |||
| PREF_OFFSET(x_pref, 128); | |||
| PREF_OFFSET(x_pref, 160); | |||
| PREF_OFFSET(x_pref, 192); | |||
| PREF_OFFSET(x_pref, 224); | |||
| x_pref += 64; | |||
| x8 = LD_SP(px); px += 4; | |||
| x0 *= da_vec; | |||
| x9 = LD_SP(px); px += 4; | |||
| x1 *= da_vec; | |||
| x10 = LD_SP(px); px += 4; | |||
| x2 *= da_vec; | |||
| x11 = LD_SP(px); px += 4; | |||
| x3 *= da_vec; | |||
| x12 = LD_SP(px); px += 4; | |||
| x4 *= da_vec; | |||
| x13 = LD_SP(px); px += 4; | |||
| x5 *= da_vec; | |||
| x14 = LD_SP(px); px += 4; | |||
| x6 *= da_vec; | |||
| x15 = LD_SP(px); px += 4; | |||
| x7 *= da_vec; | |||
| x8 *= da_vec; | |||
| ST_SP(x0, x); x += 4; | |||
| x9 *= da_vec; | |||
| ST_SP(x1, x); x += 4; | |||
| x10 *= da_vec; | |||
| ST_SP(x2, x); x += 4; | |||
| x11 *= da_vec; | |||
| ST_SP(x3, x); x += 4; | |||
| x12 *= da_vec; | |||
| ST_SP(x4, x); x += 4; | |||
| x13 *= da_vec; | |||
| ST_SP(x5, x); x += 4; | |||
| x14 *= da_vec; | |||
| ST_SP(x6, x); x += 4; | |||
| x15 *= da_vec; | |||
| ST_SP(x7, x); x += 4; | |||
| ST_SP(x8, x); x += 4; | |||
| x0 = LD_SP(px); px += 4; | |||
| ST_SP(x9, x); x += 4; | |||
| x1 = LD_SP(px); px += 4; | |||
| ST_SP(x10, x); x += 4; | |||
| x2 = LD_SP(px); px += 4; | |||
| ST_SP(x11, x); x += 4; | |||
| x3 = LD_SP(px); px += 4; | |||
| ST_SP(x12, x); x += 4; | |||
| x4 = LD_SP(px); px += 4; | |||
| ST_SP(x13, x); x += 4; | |||
| x5 = LD_SP(px); px += 4; | |||
| ST_SP(x14, x); x += 4; | |||
| x6 = LD_SP(px); px += 4; | |||
| ST_SP(x15, x); x += 4; | |||
| x7 = LD_SP(px); px += 4; | |||
| } | |||
/* Epilogue: load the last 32 elements of the final 64-element group,
   scale all 16 vectors and store them. */
| x8 = LD_SP(px); px += 4; | |||
| x0 *= da_vec; | |||
| x9 = LD_SP(px); px += 4; | |||
| x1 *= da_vec; | |||
| x10 = LD_SP(px); px += 4; | |||
| x2 *= da_vec; | |||
| x11 = LD_SP(px); px += 4; | |||
| x3 *= da_vec; | |||
| x12 = LD_SP(px); px += 4; | |||
| x4 *= da_vec; | |||
| x13 = LD_SP(px); px += 4; | |||
| x5 *= da_vec; | |||
| x14 = LD_SP(px); px += 4; | |||
| x6 *= da_vec; | |||
| x15 = LD_SP(px); px += 4; | |||
| x7 *= da_vec; | |||
| x8 *= da_vec; | |||
| ST_SP(x0, x); x += 4; | |||
| x9 *= da_vec; | |||
| ST_SP(x1, x); x += 4; | |||
| x10 *= da_vec; | |||
| ST_SP(x2, x); x += 4; | |||
| x11 *= da_vec; | |||
| ST_SP(x3, x); x += 4; | |||
| x12 *= da_vec; | |||
| ST_SP(x4, x); x += 4; | |||
| x13 *= da_vec; | |||
| ST_SP(x5, x); x += 4; | |||
| x15 *= da_vec; | |||
| ST_SP(x6, x); x += 4; | |||
| x14 *= da_vec; | |||
| ST_SP(x7, x); x += 4; | |||
| ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 4); | |||
| } | |||
/* Remainder: each set bit of (n & 63) selects one fixed-size tail. */
| if (n & 63) | |||
| { | |||
| if (n & 32) | |||
| { | |||
| LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3); | |||
| MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7); | |||
| ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4); | |||
| } | |||
| if (n & 16) | |||
| { | |||
| LD_SP4_INC(px, 4, x0, x1, x2, x3); | |||
| MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3); | |||
| ST_SP4_INC(x0, x1, x2, x3, x, 4); | |||
| } | |||
| if (n & 8) | |||
| { | |||
| LD_SP2_INC(px, 4, x0, x1); | |||
| MUL2(x0, da_vec, x1, da_vec, x0, x1); | |||
| ST_SP2_INC(x0, x1, x, 4); | |||
| } | |||
/* Tails of 4 or fewer elements use scalar FLOAT temporaries. */
| if (n & 4) | |||
| { | |||
| LD_GP4_INC(px, 1, f0, f1, f2, f3); | |||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||
| ST_GP4_INC(f0, f1, f2, f3, x, 1); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_GP2_INC(px, 1, f0, f1); | |||
| MUL2(f0, da, f1, da, f0, f1); | |||
| ST_GP2_INC(f0, f1, x, 1); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| *x *= da; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if (0.0 == da) | |||
| { | |||
/* Strided zero fill, one element per iteration. */
| for (i = n; i--;) | |||
| { | |||
| *x = 0; | |||
| x += inc_x; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if (n > 15) | |||
| { | |||
/* Prologue: the first 8 elements are loaded before the loop starts. */
| LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); | |||
/* Steady state, (n / 16) - 1 iterations of 16 elements each, with
   scalar loads, multiplies and stores interleaved. */
| for (i = 0; i < (n >> 4) - 1; i++) | |||
| { | |||
| LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15); | |||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||
| f4 *= da; | |||
| f5 *= da; | |||
| *x = f0; x += inc_x; | |||
| f6 *= da; | |||
| *x = f1; x += inc_x; | |||
| f7 *= da; | |||
| *x = f2; x += inc_x; | |||
| f8 *= da; | |||
| *x = f3; x += inc_x; | |||
| f9 *= da; | |||
| *x = f4; x += inc_x; | |||
| f10 *= da; | |||
| *x = f5; x += inc_x; | |||
| f11 *= da; | |||
| *x = f6; x += inc_x; | |||
| f12 *= da; | |||
| *x = f7; x += inc_x; | |||
| f13 *= da; | |||
| *x = f8; x += inc_x; | |||
| f14 *= da; | |||
| *x = f9; x += inc_x; | |||
| f15 *= da; | |||
| *x = f10; x += inc_x; | |||
| *x = f11; x += inc_x; | |||
| f0 = *px; px += inc_x; | |||
| *x = f12; x += inc_x; | |||
| f1 = *px; px += inc_x; | |||
| *x = f13; x += inc_x; | |||
| f2 = *px; px += inc_x; | |||
| *x = f14; x += inc_x; | |||
| f3 = *px; px += inc_x; | |||
| *x = f15; x += inc_x; | |||
| f4 = *px; px += inc_x; | |||
| f5 = *px; px += inc_x; | |||
| f6 = *px; px += inc_x; | |||
| f7 = *px; px += inc_x; | |||
| } | |||
/* Epilogue: load the last 8 elements of the final 16-element group,
   scale all 16 values and store them. */
| LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15); | |||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||
| f4 *= da; | |||
| f5 *= da; | |||
| *x = f0; x += inc_x; | |||
| f6 *= da; | |||
| *x = f1; x += inc_x; | |||
| f7 *= da; | |||
| *x = f2; x += inc_x; | |||
| f8 *= da; | |||
| *x = f3; x += inc_x; | |||
| f9 *= da; | |||
| *x = f4; x += inc_x; | |||
| f10 *= da; | |||
| *x = f5; x += inc_x; | |||
| f11 *= da; | |||
| *x = f6; x += inc_x; | |||
| f12 *= da; | |||
| *x = f7; x += inc_x; | |||
| f13 *= da; | |||
| *x = f8; x += inc_x; | |||
| f14 *= da; | |||
| *x = f9; x += inc_x; | |||
| f15 *= da; | |||
| *x = f10; x += inc_x; | |||
| *x = f11; x += inc_x; | |||
| *x = f12; x += inc_x; | |||
| *x = f13; x += inc_x; | |||
| *x = f14; x += inc_x; | |||
| *x = f15; x += inc_x; | |||
| } | |||
/* Remainder: each set bit of (n & 15) selects one fixed-size tail. */
| if (n & 15) | |||
| { | |||
| if (n & 8) | |||
| { | |||
| LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); | |||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||
| MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7); | |||
| ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_GP4_INC(px, inc_x, f0, f1, f2, f3); | |||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||
| ST_GP4_INC(f0, f1, f2, f3, x, inc_x); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_GP2_INC(px, inc_x, f0, f1); | |||
| MUL2(f0, da, f1, da, f0, f1); | |||
| ST_GP2_INC(f0, f1, x, inc_x); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| *x *= da; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,267 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, | |||
| FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y, | |||
| FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i = 0, pref_offsetx, pref_offsety; | |||
| FLOAT *px, *py; | |||
| FLOAT x0, x1, x2, x3, x4, x5, x6, x7; | |||
| FLOAT y0, y1, y2, y3, y4, y5, y6, y7; | |||
| v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7; | |||
| v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7; | |||
| if (n < 0) return (0); | |||
| pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offsetx > 0) | |||
| { | |||
| pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; | |||
| pref_offsetx = pref_offsetx / sizeof(FLOAT); | |||
| } | |||
| pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offsety > 0) | |||
| { | |||
| pref_offsety = L1_DATA_LINESIZE - pref_offsety; | |||
| pref_offsety = pref_offsety / sizeof(FLOAT); | |||
| } | |||
| px = srcx; | |||
| py = srcy; | |||
| if ((1 == inc_x) && (1 == inc_y)) | |||
| { | |||
| if (n >> 5) | |||
| { | |||
| LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7); | |||
| for (i = (n >> 5) - 1; i--;) | |||
| { | |||
| PREFETCH(px + pref_offsetx + 32); | |||
| PREFETCH(px + pref_offsetx + 40); | |||
| PREFETCH(px + pref_offsetx + 48); | |||
| PREFETCH(px + pref_offsetx + 56); | |||
| PREFETCH(py + pref_offsety + 32); | |||
| PREFETCH(py + pref_offsety + 40); | |||
| PREFETCH(py + pref_offsety + 48); | |||
| PREFETCH(py + pref_offsety + 56); | |||
| yv0 = LD_SP(py); py += 4; | |||
| ST_SP(xv0, srcy); srcy += 4; | |||
| yv1 = LD_SP(py); py += 4; | |||
| ST_SP(xv1, srcy); srcy += 4; | |||
| yv2 = LD_SP(py); py += 4; | |||
| ST_SP(xv2, srcy); srcy += 4; | |||
| yv3 = LD_SP(py); py += 4; | |||
| ST_SP(xv3, srcy); srcy += 4; | |||
| yv4 = LD_SP(py); py += 4; | |||
| ST_SP(xv4, srcy); srcy += 4; | |||
| yv5 = LD_SP(py); py += 4; | |||
| ST_SP(xv5, srcy); srcy += 4; | |||
| yv6 = LD_SP(py); py += 4; | |||
| ST_SP(xv6, srcy); srcy += 4; | |||
| yv7 = LD_SP(py); py += 4; | |||
| ST_SP(xv7, srcy); srcy += 4; | |||
| xv0 = LD_SP(px); px += 4; | |||
| ST_SP(yv0, srcx); srcx += 4; | |||
| xv1 = LD_SP(px); px += 4; | |||
| ST_SP(yv1, srcx); srcx += 4; | |||
| xv2 = LD_SP(px); px += 4; | |||
| ST_SP(yv2, srcx); srcx += 4; | |||
| xv3 = LD_SP(px); px += 4; | |||
| ST_SP(yv3, srcx); srcx += 4; | |||
| xv4 = LD_SP(px); px += 4; | |||
| ST_SP(yv4, srcx); srcx += 4; | |||
| xv5 = LD_SP(px); px += 4; | |||
| ST_SP(yv5, srcx); srcx += 4; | |||
| xv6 = LD_SP(px); px += 4; | |||
| ST_SP(yv6, srcx); srcx += 4; | |||
| xv7 = LD_SP(px); px += 4; | |||
| ST_SP(yv7, srcx); srcx += 4; | |||
| } | |||
| LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7); | |||
| ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4); | |||
| ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4); | |||
| } | |||
| if (n & 31) | |||
| { | |||
| if ((n & 16) && (n & 8) && (n & 4)) | |||
| { | |||
| LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6); | |||
| LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6); | |||
| ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4); | |||
| ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4); | |||
| } | |||
| else if ((n & 16) && (n & 8)) | |||
| { | |||
| LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5); | |||
| LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5); | |||
| ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4); | |||
| ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4); | |||
| } | |||
| else if ((n & 16) && (n & 4)) | |||
| { | |||
| LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4); | |||
| LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4); | |||
| ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4); | |||
| ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4); | |||
| } | |||
| else if ((n & 8) && (n & 4)) | |||
| { | |||
| LD_SP3_INC(px, 4, xv0, xv1, xv2); | |||
| LD_SP3_INC(py, 4, yv0, yv1, yv2); | |||
| ST_SP3_INC(xv0, xv1, xv2, srcy, 4); | |||
| ST_SP3_INC(yv0, yv1, yv2, srcx, 4); | |||
| } | |||
| else if (n & 16) | |||
| { | |||
| LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3); | |||
| LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3); | |||
| ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4); | |||
| ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4); | |||
| } | |||
| else if (n & 8) | |||
| { | |||
| LD_SP2_INC(px, 4, xv0, xv1); | |||
| LD_SP2_INC(py, 4, yv0, yv1); | |||
| ST_SP2_INC(xv0, xv1, srcy, 4); | |||
| ST_SP2_INC(yv0, yv1, srcx, 4); | |||
| } | |||
| else if (n & 4) | |||
| { | |||
| xv0 = LD_SP(px); | |||
| yv0 = LD_SP(py); | |||
| px += 4; | |||
| py += 4; | |||
| ST_SP(xv0, srcy); | |||
| ST_SP(yv0, srcx); | |||
| srcx += 4; | |||
| srcy += 4; | |||
| } | |||
| if ((n & 2) && (n & 1)) | |||
| { | |||
| LD_GP3_INC(px, 1, x0, x1, x3); | |||
| LD_GP3_INC(py, 1, y0, y1, y3); | |||
| ST_GP3_INC(x0, x1, x3, srcy, 1); | |||
| ST_GP3_INC(y0, y1, y3, srcx, 1); | |||
| } | |||
| else if (n & 2) | |||
| { | |||
| LD_GP2_INC(px, 1, x0, x1); | |||
| LD_GP2_INC(py, 1, y0, y1); | |||
| ST_GP2_INC(x0, x1, srcy, 1); | |||
| ST_GP2_INC(y0, y1, srcx, 1); | |||
| } | |||
| else if (n & 1) | |||
| { | |||
| x0 = px[0]; | |||
| y0 = py[0]; | |||
| srcx[0] = y0; | |||
| srcy[0] = x0; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7); | |||
| ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y); | |||
| ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x); | |||
| } | |||
| if (n & 7) | |||
| { | |||
| if ((n & 4) && (n & 2) && (n & 1)) | |||
| { | |||
| LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6); | |||
| LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6); | |||
| ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y); | |||
| ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x); | |||
| } | |||
| else if ((n & 4) && (n & 2)) | |||
| { | |||
| LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5); | |||
| LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5); | |||
| ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y); | |||
| ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x); | |||
| } | |||
| else if ((n & 4) && (n & 1)) | |||
| { | |||
| LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4); | |||
| LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4); | |||
| ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y); | |||
| ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x); | |||
| } | |||
| else if ((n & 2) && (n & 1)) | |||
| { | |||
| LD_GP3_INC(px, inc_x, x0, x1, x2); | |||
| LD_GP3_INC(py, inc_y, y0, y1, y2); | |||
| ST_GP3_INC(x0, x1, x2, srcy, inc_y); | |||
| ST_GP3_INC(y0, y1, y2, srcx, inc_x); | |||
| } | |||
| else if (n & 4) | |||
| { | |||
| LD_GP4_INC(px, inc_x, x0, x1, x2, x3); | |||
| LD_GP4_INC(py, inc_y, y0, y1, y2, y3); | |||
| ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y); | |||
| ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x); | |||
| } | |||
| else if (n & 2) | |||
| { | |||
| LD_GP2_INC(px, inc_x, x0, x1); | |||
| LD_GP2_INC(py, inc_y, y0, y1); | |||
| ST_GP2_INC(x0, x1, srcy, inc_y); | |||
| ST_GP2_INC(y0, y1, srcx, inc_x); | |||
| } | |||
| else if (n & 1) | |||
| { | |||
| x0 = *srcx; | |||
| y0 = *srcy; | |||
| *srcx = y0; | |||
| *srcy = x0; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,494 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| /* Compound-assignment operators used by the kernel below for the three | |||
| sign-dependent terms of the complex update (y_re += da_r * x_re is always | |||
| an addition): | |||
|   OP0:  y_im OP0 da_r * x_im | |||
|   OP1:  y_re OP1 da_i * x_im | |||
|   OP2:  y_im OP2 da_i * x_re | |||
| Without CONJ the result is y += alpha * x. With CONJ it is | |||
| y += alpha * conj(x): conjugation negates x_im only, so just the two terms | |||
| that contain x_im (OP0 and OP1) change sign and OP2 stays "+=". Using "-=" | |||
| for OP2 here would give an imaginary part of -(da_r * x_im + da_i * x_re), | |||
| which does not belong to any product whose real part is | |||
| da_r * x_re + da_i * x_im. | |||
| NOTE(review): alpha * conj(x) is assumed to be the contract of the CONJ | |||
| build, as in the generic zaxpy kernel this file replaces - confirm. */ | |||
| #if !defined(CONJ) | |||
| #define OP0 += | |||
| #define OP1 -= | |||
| #define OP2 += | |||
| #else | |||
| #define OP0 -= | |||
| #define OP1 += | |||
| #define OP2 += | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||
| BLASLONG dummy2) | |||
| { /* Complex AXPY kernel: for each of the n complex elements, accumulates the | |||
| product of the scalar (da_r, da_i) and the x element into the y element. | |||
| The scalar tail of every path spells the update out term by term: | |||
|     y_re += da_r * x_re;    y_im OP0 da_r * x_im; | |||
|     y_re OP1 da_i * x_im;   y_im OP2 da_i * x_re; | |||
| where OP0/OP1/OP2 are the compound-assignment operators selected by CONJ | |||
| at the top of this file. | |||
| Parameters: | |||
|   n           number of complex elements; n < 0 returns immediately. | |||
|   da_r, da_i  the two components of the scalar; when both compare equal to | |||
|               0.0 the function returns without touching y. | |||
|   x, inc_x    input vector and its stride. The stride is doubled before it | |||
|               is applied to the FLOAT pointer, presumably because one | |||
|               complex element occupies two consecutive FLOATs. | |||
|   y, inc_y    vector updated in place and its stride (same convention). | |||
|   dummy0, dummy1, dummy, dummy2   never referenced in the body. | |||
| Returns 0 on every path. | |||
| Four code paths are selected by which of inc_x / inc_y equal 1; they share | |||
| the same arithmetic and differ only in load/store strides and prefetching. | |||
| NOTE(review): the v2f64 locals and COPY_DOUBLE_TO_VECTOR imply that FLOAT | |||
| is double in this build - confirm. */ | |||
| BLASLONG i, inc_x2, inc_y2; /* inc_x2 / inc_y2: doubled strides, set only in the strided paths */ | |||
| FLOAT *py; /* read cursor for y; y itself is used as the write cursor */ | |||
| v2f64 x0, x1, x2, x3, x4, x5, x6, x7; | |||
| v2f64 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec; | |||
| v2f64 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i; /* x split into "r" and "i" vectors (presumably real / imaginary parts) */ | |||
| v2f64 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i; /* same split for y */ | |||
| FLOAT xd0, xd1, yd0, yd1; /* scalars for the final odd element */ | |||
| if (n < 0) return(0); | |||
| if ((da_r == 0.0) && (da_i == 0.0)) return(0); /* zero scalar: y is left as it is */ | |||
| py = y; | |||
| dar_vec = COPY_DOUBLE_TO_VECTOR(da_r); /* presumably da_r replicated into every lane - TODO confirm in macros_msa.h */ | |||
| dai_vec = COPY_DOUBLE_TO_VECTOR(da_i); /* likewise for da_i */ | |||
| if ((1 == inc_x) && (1 == inc_y)) /* path 1: both vectors contiguous */ | |||
| { | |||
| FLOAT *x_pref, *y_pref; | |||
| BLASLONG pref_offset; | |||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); /* byte offset of x inside an L1_DATA_LINESIZE block (assumes a power-of-two size) */ | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; /* bytes up to the next block boundary */ | |||
| pref_offset = pref_offset / sizeof(FLOAT); /* the same distance in FLOAT elements */ | |||
| } | |||
| x_pref = x + pref_offset + 32; /* prefetch cursor: next boundary plus 32 FLOATs */ | |||
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); /* same computation for y */ | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
| y_pref = y + pref_offset + 32; | |||
| for (i = (n >> 3); i--;) /* main loop: 8 complex elements (8 vectors of x and of y) per iteration */ | |||
| { | |||
| PREF_OFFSET(x_pref, 0); /* PREF_OFFSET is presumably a prefetch hint at pointer + byte offset - TODO confirm */ | |||
| PREF_OFFSET(x_pref, 32); | |||
| PREF_OFFSET(x_pref, 64); | |||
| PREF_OFFSET(x_pref, 96); | |||
| PREF_OFFSET(y_pref, 0); | |||
| PREF_OFFSET(y_pref, 32); | |||
| PREF_OFFSET(y_pref, 64); | |||
| PREF_OFFSET(y_pref, 96); | |||
| x_pref += 16; /* 16 FLOATs per iteration, matching the eight stride-2 loads below */ | |||
| y_pref += 16; | |||
| LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); /* presumably eight vector loads, advancing x by 2 FLOATs after each */ | |||
| LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); /* y is read through py */ | |||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); /* presumably packs the even lanes of the pair into x0r and the odd lanes into x0i - TODO confirm */ | |||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||
| PCKEVOD_D2_DP(x5, x4, x2r, x2i); | |||
| PCKEVOD_D2_DP(y5, y4, y2r, y2i); | |||
| PCKEVOD_D2_DP(x7, x6, x3r, x3i); | |||
| PCKEVOD_D2_DP(y7, y6, y3r, y3i); | |||
| FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); /* presumably yKr += xKr * dar_vec; the 2-element tail writes this step as y0r += dar_vec * x0r */ | |||
| y0i OP0 dar_vec * x0i; /* "i" accumulators: da_r * x"i" term, sign per OP0 */ | |||
| y1i OP0 dar_vec * x1i; | |||
| y2i OP0 dar_vec * x2i; | |||
| y3i OP0 dar_vec * x3i; | |||
| y0r OP1 dai_vec * x0i; /* "r" accumulators: da_i * x"i" term, sign per OP1 */ | |||
| y1r OP1 dai_vec * x1i; | |||
| y2r OP1 dai_vec * x2i; | |||
| y3r OP1 dai_vec * x3i; | |||
| y0i OP2 dai_vec * x0r; /* "i" accumulators: da_i * x"r" term, sign per OP2 */ | |||
| y1i OP2 dai_vec * x1r; | |||
| y2i OP2 dai_vec * x2r; | |||
| y3i OP2 dai_vec * x3r; | |||
| ILVRL_D2_DP(y0i, y0r, y0, y1); /* presumably re-interleaves the "r"/"i" vectors into the layout that was loaded - TODO confirm */ | |||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||
| ILVRL_D2_DP(y2i, y2r, y4, y5); | |||
| ILVRL_D2_DP(y3i, y3r, y6, y7); | |||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); /* results are written through y */ | |||
| } | |||
| if (n & 7) /* remainder, handled in steps of 4, 2 and 1 element */ | |||
| { | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(x, 2, x0, x1, x2, x3); | |||
| LD_DP4_INC(py, 2, y0, y1, y2, y3); | |||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||
| FMADD2(x0r, x1r, dar_vec, y0r, y1r); /* two-vector form of the FMADD4 step */ | |||
| y0i OP0 dar_vec * x0i; | |||
| y1i OP0 dar_vec * x1i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y1r OP1 dai_vec * x1i; | |||
| y0i OP2 dai_vec * x0r; | |||
| y1i OP2 dai_vec * x1r; | |||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||
| ST_DP4_INC(y0, y1, y2, y3, y, 2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(x, 2, x0, x1); | |||
| LD_DP2_INC(py, 2, y0, y1); | |||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||
| y0r += dar_vec * x0r; /* the unconditional da_r * x"r" term, written without the FMADD macro */ | |||
| y0i OP0 dar_vec * x0i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y0i OP2 dai_vec * x0r; | |||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||
| ST_DP2_INC(y0, y1, y, 2); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| LD_GP2_INC(x, 1, xd0, xd1); /* last element as two consecutive scalars (presumably real, then imaginary) */ | |||
| LD_GP2_INC(py, 1, yd0, yd1); | |||
| yd0 += da_r * xd0; | |||
| yd1 OP0 da_r * xd1; | |||
| yd0 OP1 da_i * xd1; | |||
| yd1 OP2 da_i * xd0; | |||
| ST_GP2_INC(yd0, yd1, y, 1); | |||
| } | |||
| } | |||
| } | |||
| else if (1 == inc_y) /* path 2: only y contiguous; x is read with stride inc_x2, only y is prefetched */ | |||
| { | |||
| FLOAT *y_pref; | |||
| BLASLONG pref_offset; | |||
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
| y_pref = y + pref_offset + 32; | |||
| inc_x2 = 2 * inc_x; /* stride applied to the FLOAT pointer x */ | |||
| for (i = (n >> 3); i--;) /* 8 complex elements per iteration */ | |||
| { | |||
| PREF_OFFSET(y_pref, 0); | |||
| PREF_OFFSET(y_pref, 32); | |||
| PREF_OFFSET(y_pref, 64); | |||
| PREF_OFFSET(y_pref, 96); | |||
| y_pref += 16; | |||
| LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); | |||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||
| PCKEVOD_D2_DP(x5, x4, x2r, x2i); | |||
| PCKEVOD_D2_DP(y5, y4, y2r, y2i); | |||
| PCKEVOD_D2_DP(x7, x6, x3r, x3i); | |||
| PCKEVOD_D2_DP(y7, y6, y3r, y3i); | |||
| FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); | |||
| y0i OP0 dar_vec * x0i; | |||
| y1i OP0 dar_vec * x1i; | |||
| y2i OP0 dar_vec * x2i; | |||
| y3i OP0 dar_vec * x3i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y1r OP1 dai_vec * x1i; | |||
| y2r OP1 dai_vec * x2i; | |||
| y3r OP1 dai_vec * x3i; | |||
| y0i OP2 dai_vec * x0r; | |||
| y1i OP2 dai_vec * x1r; | |||
| y2i OP2 dai_vec * x2r; | |||
| y3i OP2 dai_vec * x3r; | |||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||
| ILVRL_D2_DP(y2i, y2r, y4, y5); | |||
| ILVRL_D2_DP(y3i, y3r, y6, y7); | |||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); | |||
| } | |||
| if (n & 7) /* remainder: 4, 2, 1 */ | |||
| { | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(x, inc_x2, x0, x1, x2, x3); | |||
| LD_DP4_INC(py, 2, y0, y1, y2, y3); | |||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||
| FMADD2(x0r, x1r, dar_vec, y0r, y1r); | |||
| y0i OP0 dar_vec * x0i; | |||
| y1i OP0 dar_vec * x1i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y1r OP1 dai_vec * x1i; | |||
| y0i OP2 dai_vec * x0r; | |||
| y1i OP2 dai_vec * x1r; | |||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||
| ST_DP4_INC(y0, y1, y2, y3, y, 2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(x, inc_x2, x0, x1); | |||
| LD_DP2_INC(py, 2, y0, y1); | |||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||
| y0r += dar_vec * x0r; | |||
| y0i OP0 dar_vec * x0i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y0i OP2 dai_vec * x0r; | |||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||
| ST_DP2_INC(y0, y1, y, 2); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| LD_GP2_INC(x, 1, xd0, xd1); /* step of 1 here: the two parts of one element are adjacent whatever inc_x is */ | |||
| LD_GP2_INC(py, 1, yd0, yd1); | |||
| yd0 += da_r * xd0; | |||
| yd1 OP0 da_r * xd1; | |||
| yd0 OP1 da_i * xd1; | |||
| yd1 OP2 da_i * xd0; | |||
| ST_GP2_INC(yd0, yd1, y, 1); | |||
| } | |||
| } | |||
| } | |||
| else if (1 == inc_x) /* path 3: only x contiguous; y is read and written with stride inc_y2, only x is prefetched */ | |||
| { | |||
| FLOAT *x_pref; | |||
| BLASLONG pref_offset; | |||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
| x_pref = x + pref_offset + 32; | |||
| inc_y2 = 2 * inc_y; /* stride applied to the FLOAT pointers py and y */ | |||
| for (i = (n >> 3); i--;) /* 8 complex elements per iteration */ | |||
| { | |||
| PREF_OFFSET(x_pref, 0); | |||
| PREF_OFFSET(x_pref, 32); | |||
| PREF_OFFSET(x_pref, 64); | |||
| PREF_OFFSET(x_pref, 96); | |||
| x_pref += 16; | |||
| LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7); | |||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||
| PCKEVOD_D2_DP(x5, x4, x2r, x2i); | |||
| PCKEVOD_D2_DP(y5, y4, y2r, y2i); | |||
| PCKEVOD_D2_DP(x7, x6, x3r, x3i); | |||
| PCKEVOD_D2_DP(y7, y6, y3r, y3i); | |||
| FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); | |||
| y0i OP0 dar_vec * x0i; | |||
| y1i OP0 dar_vec * x1i; | |||
| y2i OP0 dar_vec * x2i; | |||
| y3i OP0 dar_vec * x3i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y1r OP1 dai_vec * x1i; | |||
| y2r OP1 dai_vec * x2i; | |||
| y3r OP1 dai_vec * x3i; | |||
| y0i OP2 dai_vec * x0r; | |||
| y1i OP2 dai_vec * x1r; | |||
| y2i OP2 dai_vec * x2r; | |||
| y3i OP2 dai_vec * x3r; | |||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||
| ILVRL_D2_DP(y2i, y2r, y4, y5); | |||
| ILVRL_D2_DP(y3i, y3r, y6, y7); | |||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2); | |||
| } | |||
| if (n & 7) /* remainder: 4, 2, 1 */ | |||
| { | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(x, 2, x0, x1, x2, x3); | |||
| LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); | |||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||
| FMADD2(x0r, x1r, dar_vec, y0r, y1r); | |||
| y0i OP0 dar_vec * x0i; | |||
| y1i OP0 dar_vec * x1i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y1r OP1 dai_vec * x1i; | |||
| y0i OP2 dai_vec * x0r; | |||
| y1i OP2 dai_vec * x1r; | |||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||
| ST_DP4_INC(y0, y1, y2, y3, y, inc_y2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(x, 2, x0, x1); | |||
| LD_DP2_INC(py, inc_y2, y0, y1); | |||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||
| y0r += dar_vec * x0r; | |||
| y0i OP0 dar_vec * x0i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y0i OP2 dai_vec * x0r; | |||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||
| ST_DP2_INC(y0, y1, y, inc_y2); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| LD_GP2_INC(x, 1, xd0, xd1); | |||
| LD_GP2_INC(py, 1, yd0, yd1); /* step of 1 here: the two parts of one element are adjacent whatever inc_y is */ | |||
| yd0 += da_r * xd0; | |||
| yd1 OP0 da_r * xd1; | |||
| yd0 OP1 da_i * xd1; | |||
| yd1 OP2 da_i * xd0; | |||
| ST_GP2_INC(yd0, yd1, y, 1); | |||
| } | |||
| } | |||
| } | |||
| else /* path 4: both vectors strided; this path issues no prefetches */ | |||
| { | |||
| inc_x2 = 2 * inc_x; | |||
| inc_y2 = 2 * inc_y; | |||
| for (i = (n >> 3); i--;) /* 8 complex elements per iteration */ | |||
| { | |||
| LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7); | |||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||
| PCKEVOD_D2_DP(x5, x4, x2r, x2i); | |||
| PCKEVOD_D2_DP(y5, y4, y2r, y2i); | |||
| PCKEVOD_D2_DP(x7, x6, x3r, x3i); | |||
| PCKEVOD_D2_DP(y7, y6, y3r, y3i); | |||
| FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); | |||
| y0i OP0 dar_vec * x0i; | |||
| y1i OP0 dar_vec * x1i; | |||
| y2i OP0 dar_vec * x2i; | |||
| y3i OP0 dar_vec * x3i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y1r OP1 dai_vec * x1i; | |||
| y2r OP1 dai_vec * x2i; | |||
| y3r OP1 dai_vec * x3i; | |||
| y0i OP2 dai_vec * x0r; | |||
| y1i OP2 dai_vec * x1r; | |||
| y2i OP2 dai_vec * x2r; | |||
| y3i OP2 dai_vec * x3r; | |||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||
| ILVRL_D2_DP(y2i, y2r, y4, y5); | |||
| ILVRL_D2_DP(y3i, y3r, y6, y7); | |||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2); | |||
| } | |||
| if (n & 7) /* remainder: 4, 2, 1 */ | |||
| { | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(x, inc_x2, x0, x1, x2, x3); | |||
| LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); | |||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||
| FMADD2(x0r, x1r, dar_vec, y0r, y1r); | |||
| y0i OP0 dar_vec * x0i; | |||
| y1i OP0 dar_vec * x1i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y1r OP1 dai_vec * x1i; | |||
| y0i OP2 dai_vec * x0r; | |||
| y1i OP2 dai_vec * x1r; | |||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||
| ST_DP4_INC(y0, y1, y2, y3, y, inc_y2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(x, inc_x2, x0, x1); | |||
| LD_DP2_INC(py, inc_y2, y0, y1); | |||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||
| y0r += dar_vec * x0r; | |||
| y0i OP0 dar_vec * x0i; | |||
| y0r OP1 dai_vec * x0i; | |||
| y0i OP2 dai_vec * x0r; | |||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||
| ST_DP2_INC(y0, y1, y, inc_y2); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| LD_GP2_INC(x, 1, xd0, xd1); | |||
| LD_GP2_INC(py, 1, yd0, yd1); | |||
| yd0 += da_r * xd0; | |||
| yd1 OP0 da_r * xd1; | |||
| yd0 OP1 da_i * xd1; | |||
| yd1 OP2 da_i * xd0; | |||
| ST_GP2_INC(yd0, yd1, y, 1); | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,218 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { /* Complex copy kernel: copies n elements from x to y, moving each element | |||
| as one v2f64 vector (or as two consecutive FLOAT scalars for the final odd | |||
| element). | |||
| Parameters: | |||
|   n          number of elements to copy; n < 0 returns immediately. | |||
|   x, inc_x   source vector and its stride. | |||
|   y, inc_y   destination vector and its stride. | |||
| In the strided path both strides are doubled before being applied to the | |||
| FLOAT pointers, presumably because one complex element occupies two FLOATs. | |||
| Returns 0 on every path. | |||
| The pointers x and y themselves are advanced as the copy proceeds. | |||
| NOTE(review): LD_DP / ST_DP are assumed to load / store one v2f64 at the | |||
| given address - confirm in macros_msa.h. */ | |||
| BLASLONG i; | |||
| v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; /* two banks of eight vectors (x0-x7, x8-x15) */ | |||
| FLOAT f0, f1; /* scalars for the final odd element */ | |||
| if (n < 0) return (0); | |||
| if ((1 == inc_x) && (1 == inc_y)) /* contiguous path */ | |||
| { | |||
| if (n > 15) /* at least one full group of 16 elements */ | |||
| { | |||
| FLOAT *x_pref; | |||
| BLASLONG pref_offset; | |||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); /* byte offset of x inside an L1_DATA_LINESIZE block (assumes a power-of-two size) */ | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; /* bytes up to the next block boundary */ | |||
| pref_offset = pref_offset / sizeof(FLOAT); /* the same distance in FLOAT elements */ | |||
| } | |||
| x_pref = x + pref_offset + 64 + 16; /* prefetch cursor: next boundary plus 80 FLOATs */ | |||
| LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); /* prologue: fill the first bank before the loop */ | |||
| for (i = (n >> 4) - 1; i--;) /* one pass per group of 16 elements except the last, which the epilogue finishes */ | |||
| { | |||
| PREF_OFFSET(x_pref, 0); /* PREF_OFFSET is presumably a prefetch hint at pointer + byte offset - TODO confirm */ | |||
| PREF_OFFSET(x_pref, 32); | |||
| PREF_OFFSET(x_pref, 64); | |||
| PREF_OFFSET(x_pref, 96); | |||
| PREF_OFFSET(x_pref, 128); | |||
| PREF_OFFSET(x_pref, 160); | |||
| PREF_OFFSET(x_pref, 192); | |||
| PREF_OFFSET(x_pref, 224); | |||
| x_pref += 32; /* 32 FLOATs per pass, matching the sixteen stride-2 loads below */ | |||
| x8 = LD_DP(x); x += 2; /* first half: load the second bank while storing the first */ | |||
| ST_DP(x0, y); y += 2; | |||
| x9 = LD_DP(x); x += 2; | |||
| ST_DP(x1, y); y += 2; | |||
| x10 = LD_DP(x); x += 2; | |||
| ST_DP(x2, y); y += 2; | |||
| x11 = LD_DP(x); x += 2; | |||
| ST_DP(x3, y); y += 2; | |||
| x12 = LD_DP(x); x += 2; | |||
| ST_DP(x4, y); y += 2; | |||
| x13 = LD_DP(x); x += 2; | |||
| ST_DP(x5, y); y += 2; | |||
| x14 = LD_DP(x); x += 2; | |||
| ST_DP(x6, y); y += 2; | |||
| x15 = LD_DP(x); x += 2; | |||
| ST_DP(x7, y); y += 2; | |||
| x0 = LD_DP(x); x += 2; /* second half: reload the first bank while storing the second */ | |||
| ST_DP(x8, y); y += 2; | |||
| x1 = LD_DP(x); x += 2; | |||
| ST_DP(x9, y); y += 2; | |||
| x2 = LD_DP(x); x += 2; | |||
| ST_DP(x10, y); y += 2; | |||
| x3 = LD_DP(x); x += 2; | |||
| ST_DP(x11, y); y += 2; | |||
| x4 = LD_DP(x); x += 2; | |||
| ST_DP(x12, y); y += 2; | |||
| x5 = LD_DP(x); x += 2; | |||
| ST_DP(x13, y); y += 2; | |||
| x6 = LD_DP(x); x += 2; | |||
| ST_DP(x14, y); y += 2; | |||
| x7 = LD_DP(x); x += 2; | |||
| ST_DP(x15, y); y += 2; | |||
| } | |||
| x8 = LD_DP(x); x += 2; /* epilogue: load the last eight vectors and drain both banks */ | |||
| x9 = LD_DP(x); x += 2; | |||
| ST_DP(x0, y); y += 2; | |||
| x10 = LD_DP(x); x += 2; | |||
| ST_DP(x1, y); y += 2; | |||
| x11 = LD_DP(x); x += 2; | |||
| ST_DP(x2, y); y += 2; | |||
| x12 = LD_DP(x); x += 2; | |||
| ST_DP(x3, y); y += 2; | |||
| x13 = LD_DP(x); x += 2; | |||
| ST_DP(x4, y); y += 2; | |||
| x14 = LD_DP(x); x += 2; | |||
| ST_DP(x5, y); y += 2; | |||
| x15 = LD_DP(x); x += 2; | |||
| ST_DP(x6, y); y += 2; | |||
| ST_DP(x7, y); y += 2; | |||
| ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2); | |||
| } | |||
| if (n & 15) /* remainder, handled in steps of 8, 4, 2 and 1 element */ | |||
| { | |||
| if (n & 8) | |||
| { | |||
| LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(x, 2, x0, x1, x2, x3); | |||
| ST_DP4_INC(x0, x1, x2, x3, y, 2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(x, 2, x0, x1); | |||
| ST_DP2_INC(x0, x1, y, 2); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| LD_GP2_INC(x, 1, f0, f1); /* last element copied as two consecutive scalars */ | |||
| ST_GP2_INC(f0, f1, y, 1); | |||
| } | |||
| } | |||
| } | |||
| else /* strided path: no prefetching, and no load carried across iterations */ | |||
| { | |||
| inc_x *= 2; /* from here on the strides are applied directly to the FLOAT pointers */ | |||
| inc_y *= 2; | |||
| for (i = (n >> 4); i--;) /* 16 elements per iteration */ | |||
| { | |||
| x0 = LD_DP(x); x += inc_x; | |||
| x1 = LD_DP(x); x += inc_x; | |||
| x2 = LD_DP(x); x += inc_x; | |||
| x3 = LD_DP(x); x += inc_x; | |||
| x4 = LD_DP(x); x += inc_x; | |||
| x5 = LD_DP(x); x += inc_x; | |||
| x6 = LD_DP(x); x += inc_x; | |||
| x7 = LD_DP(x); x += inc_x; | |||
| x8 = LD_DP(x); x += inc_x; /* from here the remaining loads are interleaved with stores of the first bank */ | |||
| ST_DP(x0, y); y += inc_y; | |||
| x9 = LD_DP(x); x += inc_x; | |||
| ST_DP(x1, y); y += inc_y; | |||
| x10 = LD_DP(x); x += inc_x; | |||
| ST_DP(x2, y); y += inc_y; | |||
| x11 = LD_DP(x); x += inc_x; | |||
| ST_DP(x3, y); y += inc_y; | |||
| x12 = LD_DP(x); x += inc_x; | |||
| ST_DP(x4, y); y += inc_y; | |||
| x13 = LD_DP(x); x += inc_x; | |||
| ST_DP(x5, y); y += inc_y; | |||
| x14 = LD_DP(x); x += inc_x; | |||
| ST_DP(x6, y); y += inc_y; | |||
| x15 = LD_DP(x); x += inc_x; | |||
| ST_DP(x7, y); y += inc_y; | |||
| ST_DP(x8, y); y += inc_y; /* drain the second bank */ | |||
| ST_DP(x9, y); y += inc_y; | |||
| ST_DP(x10, y); y += inc_y; | |||
| ST_DP(x11, y); y += inc_y; | |||
| ST_DP(x12, y); y += inc_y; | |||
| ST_DP(x13, y); y += inc_y; | |||
| ST_DP(x14, y); y += inc_y; | |||
| ST_DP(x15, y); y += inc_y; | |||
| } | |||
| if (n & 15) /* remainder: 8, 4, 2, 1 */ | |||
| { | |||
| if (n & 8) | |||
| { | |||
| LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, inc_y); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(x, inc_x, x0, x1, x2, x3); | |||
| ST_DP4_INC(x0, x1, x2, x3, y, inc_y); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(x, inc_x, x0, x1); | |||
| ST_DP2_INC(x0, x1, y, inc_y); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| LD_GP2_INC(x, 1, f0, f1); /* step of 1 here: the two FLOATs of one element are adjacent whatever the stride */ | |||
| ST_GP2_INC(f0, f1, y, 1); | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,717 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2017, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| /* Word-shuffle control value: 0x4E (decimal 78) is the bit pattern | |||
| 01 00 11 10, which reorders words 0 1 2 3 of the shuffled vector | |||
| into 2 3 0 1. */ | |||
| #define SHF_78 0x4E | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||
| BLASLONG dummy2) | |||
| { | |||
| BLASLONG i, inc_x2; | |||
| FLOAT *px; | |||
| FLOAT tp0, tp1, f0, f1; | |||
| v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; | |||
| v2f64 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15; | |||
| v2f64 da_i_vec, da_i_vec_neg, da_r_vec; | |||
| px = x; | |||
| if (1 == inc_x) | |||
| { | |||
| if ((0.0 == da_r) && (0.0 == da_i)) | |||
| { | |||
| v2f64 zero_v = __msa_cast_to_vector_double(0); | |||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||
| zero_v, zero_v, x, 2); | |||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||
| zero_v, zero_v, x, 2); | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if (n & 8) | |||
| { | |||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||
| zero_v, zero_v, x, 2); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| ST_DP2_INC(zero_v, zero_v, x, 2); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| ST_DP(zero_v, x); | |||
| } | |||
| } | |||
| } | |||
| else if (0.0 == da_r) | |||
| { | |||
| da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i); | |||
| da_i_vec_neg = -da_i_vec; | |||
| da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec); | |||
| if (n > 15) | |||
| { | |||
| FLOAT *x_pref; | |||
| BLASLONG pref_offset; | |||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
| x_pref = x + pref_offset + 32 + 16; | |||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| for (i = (n >> 4)- 1; i--;) | |||
| { | |||
| PREF_OFFSET(x_pref, 0); | |||
| PREF_OFFSET(x_pref, 32); | |||
| PREF_OFFSET(x_pref, 64); | |||
| PREF_OFFSET(x_pref, 96); | |||
| PREF_OFFSET(x_pref, 128); | |||
| PREF_OFFSET(x_pref, 160); | |||
| PREF_OFFSET(x_pref, 192); | |||
| PREF_OFFSET(x_pref, 224); | |||
| x_pref += 32; | |||
| x8 = LD_DP(px); px += 2; | |||
| x0 *= da_i_vec; | |||
| x9 = LD_DP(px); px += 2; | |||
| x1 *= da_i_vec; | |||
| x10 = LD_DP(px); px += 2; | |||
| x2 *= da_i_vec; | |||
| x11 = LD_DP(px); px += 2; | |||
| x3 *= da_i_vec; | |||
| x12 = LD_DP(px); px += 2; | |||
| x4 *= da_i_vec; | |||
| x13 = LD_DP(px); px += 2; | |||
| x5 *= da_i_vec; | |||
| x0 = (v2f64) __msa_shf_w((v4i32) x0, SHF_78); | |||
| x14 = LD_DP(px); px += 2; | |||
| x6 *= da_i_vec; | |||
| x1 = (v2f64) __msa_shf_w((v4i32) x1, SHF_78); | |||
| x15 = LD_DP(px); px += 2; | |||
| x7 *= da_i_vec; | |||
| x2 = (v2f64) __msa_shf_w((v4i32) x2, SHF_78); | |||
| x8 *= da_i_vec; | |||
| x3 = (v2f64) __msa_shf_w((v4i32) x3, SHF_78); | |||
| ST_DP(x0, x); x += 2; | |||
| x9 *= da_i_vec; | |||
| x4 = (v2f64) __msa_shf_w((v4i32) x4, SHF_78); | |||
| ST_DP(x1, x); x += 2; | |||
| x10 *= da_i_vec; | |||
| x5 = (v2f64) __msa_shf_w((v4i32) x5, SHF_78); | |||
| ST_DP(x2, x); x += 2; | |||
| x11 *= da_i_vec; | |||
| x6 = (v2f64) __msa_shf_w((v4i32) x6, SHF_78); | |||
| ST_DP(x3, x); x += 2; | |||
| x12 *= da_i_vec; | |||
| x7 = (v2f64) __msa_shf_w((v4i32) x7, SHF_78); | |||
| ST_DP(x4, x); x += 2; | |||
| x13 *= da_i_vec; | |||
| x8 = (v2f64) __msa_shf_w((v4i32) x8, SHF_78); | |||
| ST_DP(x5, x); x += 2; | |||
| x14 *= da_i_vec; | |||
| x9 = (v2f64) __msa_shf_w((v4i32) x9, SHF_78); | |||
| ST_DP(x6, x); x += 2; | |||
| x15 *= da_i_vec; | |||
| x10 = (v2f64) __msa_shf_w((v4i32) x10, SHF_78); | |||
| ST_DP(x7, x); x += 2; | |||
| x11 = (v2f64) __msa_shf_w((v4i32) x11, SHF_78); | |||
| ST_DP(x8, x); x += 2; | |||
| x0 = LD_DP(px); px += 2; | |||
| x12 = (v2f64) __msa_shf_w((v4i32) x12, SHF_78); | |||
| ST_DP(x9, x); x += 2; | |||
| x1 = LD_DP(px); px += 2; | |||
| x13 = (v2f64) __msa_shf_w((v4i32) x13, SHF_78); | |||
| ST_DP(x10, x); x += 2; | |||
| x2 = LD_DP(px); px += 2; | |||
| x14 = (v2f64) __msa_shf_w((v4i32) x14, SHF_78); | |||
| ST_DP(x11, x); x += 2; | |||
| x3 = LD_DP(px); px += 2; | |||
| x15 = (v2f64) __msa_shf_w((v4i32) x15, SHF_78); | |||
| ST_DP(x12, x); x += 2; | |||
| x4 = LD_DP(px); px += 2; | |||
| ST_DP(x13, x); x += 2; | |||
| x5 = LD_DP(px); px += 2; | |||
| ST_DP(x14, x); x += 2; | |||
| x6 = LD_DP(px); px += 2; | |||
| ST_DP(x15, x); x += 2; | |||
| x7 = LD_DP(px); px += 2; | |||
| } | |||
| LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15); | |||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||
| x0, x1, x2, x3); | |||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||
| x4, x5, x6, x7); | |||
| MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, | |||
| x8, x9, x10, x11); | |||
| MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, | |||
| x12, x13, x14, x15); | |||
| SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); | |||
| SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78); | |||
| SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78); | |||
| SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78); | |||
| ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, | |||
| x12, x13, x14, x15, x, 2); | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if (n & 8) | |||
| { | |||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||
| x0, x1, x2, x3); | |||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||
| x4, x5, x6, x7); | |||
| SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); | |||
| SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78); | |||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(px, 2, x0, x1, x2, x3); | |||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||
| x0, x1, x2, x3); | |||
| SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); | |||
| ST_DP4_INC(x0, x1, x2, x3, x, 2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(px, 2, x0, x1); | |||
| MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1); | |||
| SHF_W2_DP(x0, x1, x0, x1, SHF_78); | |||
| ST_DP2_INC(x0, x1, x, 2); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| LD_GP2_INC(px, 1, f0, f1); | |||
| MUL2(f0, da_i, f1, -da_i, f0, f1); | |||
| ST_GP2_INC(f1, f0, x, 1); | |||
| } | |||
| } | |||
| } | |||
| else if (0.0 == da_i) | |||
| { | |||
| da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r); | |||
| if (n > 15) | |||
| { | |||
| FLOAT *x_pref; | |||
| BLASLONG pref_offset; | |||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
| x_pref = x + pref_offset + 32 + 16; | |||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| for (i = (n >> 4)- 1; i--;) | |||
| { | |||
| PREF_OFFSET(x_pref, 0); | |||
| PREF_OFFSET(x_pref, 32); | |||
| PREF_OFFSET(x_pref, 64); | |||
| PREF_OFFSET(x_pref, 96); | |||
| PREF_OFFSET(x_pref, 128); | |||
| PREF_OFFSET(x_pref, 160); | |||
| PREF_OFFSET(x_pref, 192); | |||
| PREF_OFFSET(x_pref, 224); | |||
| x_pref += 32; | |||
| x8 = LD_DP(px); px += 2; | |||
| x0 *= da_r_vec; | |||
| x9 = LD_DP(px); px += 2; | |||
| x1 *= da_r_vec; | |||
| x10 = LD_DP(px); px += 2; | |||
| x2 *= da_r_vec; | |||
| x11 = LD_DP(px); px += 2; | |||
| x3 *= da_r_vec; | |||
| x12 = LD_DP(px); px += 2; | |||
| x4 *= da_r_vec; | |||
| x13 = LD_DP(px); px += 2; | |||
| x5 *= da_r_vec; | |||
| ST_DP(x0, x); x += 2; | |||
| x14 = LD_DP(px); px += 2; | |||
| x6 *= da_r_vec; | |||
| ST_DP(x1, x); x += 2; | |||
| x15 = LD_DP(px); px += 2; | |||
| x7 *= da_r_vec; | |||
| ST_DP(x2, x); x += 2; | |||
| x8 *= da_r_vec; | |||
| ST_DP(x3, x); x += 2; | |||
| x9 *= da_r_vec; | |||
| ST_DP(x4, x); x += 2; | |||
| x10 *= da_r_vec; | |||
| ST_DP(x5, x); x += 2; | |||
| x11 *= da_r_vec; | |||
| ST_DP(x6, x); x += 2; | |||
| x12 *= da_r_vec; | |||
| ST_DP(x7, x); x += 2; | |||
| x13 *= da_r_vec; | |||
| ST_DP(x8, x); x += 2; | |||
| x0 = LD_DP(px); px += 2; | |||
| x14 *= da_r_vec; | |||
| ST_DP(x9, x); x += 2; | |||
| x1 = LD_DP(px); px += 2; | |||
| x15 *= da_r_vec; | |||
| ST_DP(x10, x); x += 2; | |||
| x2 = LD_DP(px); px += 2; | |||
| ST_DP(x11, x); x += 2; | |||
| x3 = LD_DP(px); px += 2; | |||
| ST_DP(x12, x); x += 2; | |||
| x4 = LD_DP(px); px += 2; | |||
| ST_DP(x13, x); x += 2; | |||
| x5 = LD_DP(px); px += 2; | |||
| ST_DP(x14, x); x += 2; | |||
| x6 = LD_DP(px); px += 2; | |||
| ST_DP(x15, x); x += 2; | |||
| x7 = LD_DP(px); px += 2; | |||
| } | |||
| LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15); | |||
| MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, | |||
| x0, x1, x2, x3); | |||
| MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, | |||
| x4, x5, x6, x7); | |||
| MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec, | |||
| x8, x9, x10, x11); | |||
| MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec, | |||
| x12, x13, x14, x15); | |||
| ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, | |||
| x12, x13, x14, x15, x, 2); | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if (n & 8) | |||
| { | |||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, | |||
| x0, x1, x2, x3); | |||
| MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, | |||
| x4, x5, x6, x7); | |||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(px, 2, x0, x1, x2, x3); | |||
| MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, | |||
| x0, x1, x2, x3); | |||
| ST_DP4_INC(x0, x1, x2, x3, x, 2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(px, 2, x0, x1); | |||
| MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1); | |||
| ST_DP2_INC(x0, x1, x, 2); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| LD_GP2_INC(px, 1, f0, f1); | |||
| MUL2(f0, da_r, f1, da_r, f0, f1); | |||
| ST_GP2_INC(f0, f1, x, 1); | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| FLOAT *x_pref; | |||
| BLASLONG pref_offset; | |||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||
| if (pref_offset > 0) | |||
| { | |||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||
| pref_offset = pref_offset / sizeof(FLOAT); | |||
| } | |||
| x_pref = x + pref_offset + 32; | |||
| da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i); | |||
| da_i_vec_neg = -da_i_vec; | |||
| da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec); | |||
| da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r); | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| PREF_OFFSET(x_pref, 0); | |||
| PREF_OFFSET(x_pref, 32); | |||
| PREF_OFFSET(x_pref, 64); | |||
| PREF_OFFSET(x_pref, 96); | |||
| PREF_OFFSET(x_pref, 128); | |||
| PREF_OFFSET(x_pref, 160); | |||
| PREF_OFFSET(x_pref, 192); | |||
| PREF_OFFSET(x_pref, 224); | |||
| x_pref += 32; | |||
| LD_DP16_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, | |||
| x11, x12, x13, x14, x15); | |||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||
| d0, d1, d2, d3); | |||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||
| d4, d5, d6, d7); | |||
| MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, | |||
| d8, d9, d10, d11); | |||
| MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, | |||
| d12, d13, d14, d15); | |||
| SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); | |||
| SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78); | |||
| SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78); | |||
| SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78); | |||
| FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); | |||
| FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7); | |||
| FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11); | |||
| FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15); | |||
| ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, | |||
| d12, d13, d14, d15, x, 2); | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if (n & 8) | |||
| { | |||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||
| d0, d1, d2, d3); | |||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||
| d4, d5, d6, d7); | |||
| SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); | |||
| SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78); | |||
| FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); | |||
| FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7); | |||
| ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 2); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(px, 2, x0, x1, x2, x3); | |||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||
| d0, d1, d2, d3); | |||
| SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); | |||
| FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); | |||
| ST_DP4_INC(d0, d1, d2, d3, x, 2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(px, 2, x0, x1); | |||
| MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1); | |||
| SHF_W2_DP(d0, d1, d0, d1, SHF_78); | |||
| FMADD2(x0, x1, da_r_vec, d0, d1); | |||
| ST_DP2_INC(d0, d1, x, 2); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| LD_GP2_INC(px, 1, f0, f1); | |||
| tp0 = da_r * f0; | |||
| tp0 -= da_i * f1; | |||
| tp1 = da_r * f1; | |||
| tp1 += da_i * f0; | |||
| ST_GP2_INC(tp0, tp1, x, 1); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| inc_x2 = 2 * inc_x; | |||
| if ((0.0 == da_r) && (0.0 == da_i)) | |||
| { | |||
| v2f64 zero_v = __msa_cast_to_vector_double(0); | |||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||
| zero_v, zero_v, x, inc_x2); | |||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||
| zero_v, zero_v, x, inc_x2); | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if (n & 8) | |||
| { | |||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||
| zero_v, zero_v, x, inc_x2); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, inc_x2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| ST_DP2_INC(zero_v, zero_v, x, inc_x2); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| ST_DP(zero_v, x); | |||
| } | |||
| } | |||
| } | |||
| else if (0.0 == da_r) | |||
| { | |||
| da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i); | |||
| da_i_vec_neg = -da_i_vec; | |||
| da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec); | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, | |||
| x10, x11, x12, x13, x14, x15); | |||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||
| x0, x1, x2, x3); | |||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||
| x4, x5, x6, x7); | |||
| MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, | |||
| x8, x9, x10, x11); | |||
| MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, | |||
| x12, x13, x14, x15); | |||
| SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); | |||
| SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78); | |||
| SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78); | |||
| SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78); | |||
| ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, | |||
| x12, x13, x14, x15, x, inc_x2); | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if (n & 8) | |||
| { | |||
| LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||
| x0, x1, x2, x3); | |||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||
| x4, x5, x6, x7); | |||
| SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); | |||
| SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78); | |||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); | |||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||
| x0, x1, x2, x3); | |||
| SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); | |||
| ST_DP4_INC(x0, x1, x2, x3, x, inc_x2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(px, inc_x2, x0, x1); | |||
| MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1); | |||
| SHF_W2_DP(x0, x1, x0, x1, SHF_78); | |||
| ST_DP2_INC(x0, x1, x, inc_x2); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| LD_GP2_INC(px, 1, f0, f1); | |||
| MUL2(f0, da_i, f1, -da_i, f0, f1); | |||
| ST_GP2_INC(f1, f0, x, 1); | |||
| } | |||
| } | |||
| } | |||
| else if (0.0 == da_i) | |||
| { | |||
| da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r); | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, | |||
| x10, x11, x12, x13, x14, x15); | |||
| MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, | |||
| x0, x1, x2, x3); | |||
| MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, | |||
| x4, x5, x6, x7); | |||
| MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec, | |||
| x8, x9, x10, x11); | |||
| MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec, | |||
| x12, x13, x14, x15); | |||
| ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, | |||
| x12, x13, x14, x15, x, inc_x2); | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if (n & 8) | |||
| { | |||
| LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, | |||
| x0, x1, x2, x3); | |||
| MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, | |||
| x4, x5, x6, x7); | |||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); | |||
| MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, | |||
| x0, x1, x2, x3); | |||
| ST_DP4_INC(x0, x1, x2, x3, x, inc_x2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(px, inc_x2, x0, x1); | |||
| MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1); | |||
| ST_DP2_INC(x0, x1, x, inc_x2); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| LD_GP2_INC(px, 1, f0, f1); | |||
| MUL2(f0, da_r, f1, da_r, f0, f1); | |||
| ST_GP2_INC(f0, f1, x, 1); | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i); | |||
| da_i_vec_neg = -da_i_vec; | |||
| da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec); | |||
| da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r); | |||
| for (i = (n >> 4); i--;) | |||
| { | |||
| LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, | |||
| x10, x11, x12, x13, x14, x15); | |||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||
| d0, d1, d2, d3); | |||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||
| d4, d5, d6, d7); | |||
| MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, | |||
| d8, d9, d10, d11); | |||
| MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, | |||
| d12, d13, d14, d15); | |||
| SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); | |||
| SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78); | |||
| SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78); | |||
| SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78); | |||
| FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); | |||
| FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7); | |||
| FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11); | |||
| FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15); | |||
| ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, | |||
| d12, d13, d14, d15, x, inc_x2); | |||
| } | |||
| if (n & 15) | |||
| { | |||
| if (n & 8) | |||
| { | |||
| LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); | |||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||
| d0, d1, d2, d3); | |||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||
| d4, d5, d6, d7); | |||
| SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); | |||
| SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78); | |||
| FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); | |||
| FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7); | |||
| ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, inc_x2); | |||
| } | |||
| if (n & 4) | |||
| { | |||
| LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); | |||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||
| d0, d1, d2, d3); | |||
| SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); | |||
| FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); | |||
| ST_DP4_INC(d0, d1, d2, d3, x, inc_x2); | |||
| } | |||
| if (n & 2) | |||
| { | |||
| LD_DP2_INC(px, inc_x2, x0, x1); | |||
| MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1); | |||
| SHF_W2_DP(d0, d1, d0, d1, SHF_78); | |||
| FMADD2(x0, x1, da_r_vec, d0, d1); | |||
| ST_DP2_INC(d0, d1, x, inc_x2); | |||
| } | |||
| if (n & 1) | |||
| { | |||
| LD_GP2_INC(px, 1, f0, f1); | |||
| tp0 = da_r * f0; | |||
| tp0 -= da_i * f1; | |||
| tp1 = da_r * f1; | |||
| tp1 += da_i * f0; | |||
| ST_GP2_INC(tp0, tp1, x, 1); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,238 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #include "common.h" | |||
| #include "macros_msa.h" | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, /* Exchanges the contents of srcx and srcy in place, one v2f64 (two FLOATs) per logical element. */ | |||
| FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, /* inc_x and inc_y are element strides; they are doubled below before being used as FLOAT strides. */ | |||
| BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) /* None of the dummy* parameters is read in this body; the function always returns 0. */ | |||
| { | |||
| BLASLONG i, inc_x2, inc_y2, pref_offsetx, pref_offsety; | |||
| FLOAT *px, *py; /* read cursors; srcx and srcy themselves are reused as the write cursors */ | |||
| v2f64 x0, x1, x2, x3, x4, x5, x6, x7; | |||
| v2f64 y0, y1, y2, y3, y4, y5, y6, y7; | |||
| if (n < 0) return (0); /* n == 0 is not rejected here; it falls through with every loop and tail below skipped */ | |||
| pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1); /* byte offset of srcx inside its cache line; assumes L1_DATA_LINESIZE is a power of two - TODO confirm */ | |||
| if (pref_offsetx > 0) | |||
| { | |||
| pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; /* bytes remaining up to the next line boundary */ | |||
| pref_offsetx = pref_offsetx / sizeof(FLOAT); /* converted from bytes to a FLOAT count for pointer arithmetic */ | |||
| } | |||
| pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1); /* same computation for srcy */ | |||
| if (pref_offsety > 0) | |||
| { | |||
| pref_offsety = L1_DATA_LINESIZE - pref_offsety; | |||
| pref_offsety = pref_offsety / sizeof(FLOAT); | |||
| } | |||
| inc_x2 = 2 * inc_x; /* stride in FLOATs: each logical element spans two FLOATs */ | |||
| inc_y2 = 2 * inc_y; | |||
| px = srcx; | |||
| py = srcy; | |||
| if ((1 == inc_x) && (1 == inc_y)) /* unit-stride path: hand-pipelined, with prefetching */ | |||
| { | |||
| if (n >> 3) /* at least one full group of 8 elements */ | |||
| { | |||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); /* prologue: preload the first 8 x vectors; macro is defined outside this view, presumably advancing px by 2 per vector */ | |||
| for (i = (n >> 3) - 1; i--;) /* runs once per group of 8 except the last, which is finished after the loop */ | |||
| { | |||
| PREFETCH(px + pref_offsetx + 16); /* PREFETCH is defined outside this view; presumably a cache hint for data ahead of the read cursor */ | |||
| PREFETCH(px + pref_offsetx + 20); | |||
| PREFETCH(px + pref_offsetx + 24); | |||
| PREFETCH(px + pref_offsetx + 28); | |||
| PREFETCH(py + pref_offsety + 16); | |||
| PREFETCH(py + pref_offsety + 20); | |||
| PREFETCH(py + pref_offsety + 24); | |||
| PREFETCH(py + pref_offsety + 28); | |||
| y0 = LD_DP(py); py += 2; /* py and srcy advance in lockstep, so each y vector is read just before its slot is overwritten */ | |||
| ST_DP(x0, srcy); srcy += 2; /* the x vectors held from the previous step are written into y */ | |||
| y1 = LD_DP(py); py += 2; | |||
| ST_DP(x1, srcy); srcy += 2; | |||
| y2 = LD_DP(py); py += 2; | |||
| ST_DP(x2, srcy); srcy += 2; | |||
| y3 = LD_DP(py); py += 2; | |||
| ST_DP(x3, srcy); srcy += 2; | |||
| y4 = LD_DP(py); py += 2; | |||
| ST_DP(x4, srcy); srcy += 2; | |||
| y5 = LD_DP(py); py += 2; | |||
| ST_DP(x5, srcy); srcy += 2; | |||
| y6 = LD_DP(py); py += 2; | |||
| ST_DP(x6, srcy); srcy += 2; | |||
| y7 = LD_DP(py); py += 2; | |||
| ST_DP(x7, srcy); srcy += 2; | |||
| x0 = LD_DP(px); px += 2; /* load the next group of x while the y vectors just read are written into the previous x group */ | |||
| ST_DP(y0, srcx); srcx += 2; /* srcx trails px, presumably by one group of 8 vectors after the prologue */ | |||
| x1 = LD_DP(px); px += 2; | |||
| ST_DP(y1, srcx); srcx += 2; | |||
| x2 = LD_DP(px); px += 2; | |||
| ST_DP(y2, srcx); srcx += 2; | |||
| x3 = LD_DP(px); px += 2; | |||
| ST_DP(y3, srcx); srcx += 2; | |||
| x4 = LD_DP(px); px += 2; | |||
| ST_DP(y4, srcx); srcx += 2; | |||
| x5 = LD_DP(px); px += 2; | |||
| ST_DP(y5, srcx); srcx += 2; | |||
| x6 = LD_DP(px); px += 2; | |||
| ST_DP(y6, srcx); srcx += 2; | |||
| x7 = LD_DP(px); px += 2; | |||
| ST_DP(y7, srcx); srcx += 2; | |||
| } | |||
| LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); /* epilogue: finish the last group, whose x vectors are already held in x0..x7 */ | |||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, 2); | |||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, 2); | |||
| } | |||
| if (n & 7) /* 1 to 7 elements left over; exactly one branch of the chain below runs */ | |||
| { | |||
| if ((n & 4) && (n & 2) && (n & 1)) /* 7 left */ | |||
| { | |||
| LD_DP7_INC(px, 2, x0, x1, x2, x3, x4, x5, x6); | |||
| LD_DP7_INC(py, 2, y0, y1, y2, y3, y4, y5, y6); | |||
| ST_DP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, 2); | |||
| ST_DP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, 2); | |||
| } | |||
| else if ((n & 4) && (n & 2)) /* 6 left */ | |||
| { | |||
| LD_DP6_INC(px, 2, x0, x1, x2, x3, x4, x5); | |||
| LD_DP6_INC(py, 2, y0, y1, y2, y3, y4, y5); | |||
| ST_DP6_INC(x0, x1, x2, x3, x4, x5, srcy, 2); | |||
| ST_DP6_INC(y0, y1, y2, y3, y4, y5, srcx, 2); | |||
| } | |||
| else if ((n & 4) && (n & 1)) /* 5 left */ | |||
| { | |||
| LD_DP5_INC(px, 2, x0, x1, x2, x3, x4); | |||
| LD_DP5_INC(py, 2, y0, y1, y2, y3, y4); | |||
| ST_DP5_INC(x0, x1, x2, x3, x4, srcy, 2); | |||
| ST_DP5_INC(y0, y1, y2, y3, y4, srcx, 2); | |||
| } | |||
| else if ((n & 2) && (n & 1)) /* 3 left */ | |||
| { | |||
| LD_DP3_INC(px, 2, x0, x1, x2); | |||
| LD_DP3_INC(py, 2, y0, y1, y2); | |||
| ST_DP3_INC(x0, x1, x2, srcy, 2); | |||
| ST_DP3_INC(y0, y1, y2, srcx, 2); | |||
| } | |||
| else if (n & 4) /* 4 left */ | |||
| { | |||
| LD_DP4_INC(px, 2, x0, x1, x2, x3); | |||
| LD_DP4_INC(py, 2, y0, y1, y2, y3); | |||
| ST_DP4_INC(x0, x1, x2, x3, srcy, 2); | |||
| ST_DP4_INC(y0, y1, y2, y3, srcx, 2); | |||
| } | |||
| else if (n & 2) /* 2 left */ | |||
| { | |||
| LD_DP2_INC(px, 2, x0, x1); | |||
| LD_DP2_INC(py, 2, y0, y1); | |||
| ST_DP2_INC(x0, x1, srcy, 2); | |||
| ST_DP2_INC(y0, y1, srcx, 2); | |||
| } | |||
| else if (n & 1) /* 1 left: a single vector each way, no pointer update needed afterwards */ | |||
| { | |||
| x0 = LD_DP(px); | |||
| y0 = LD_DP(py); | |||
| ST_DP(y0, srcx); | |||
| ST_DP(x0, srcy); | |||
| } | |||
| } | |||
| } | |||
| else /* general strides: same swap in groups of 8, without the pipelining or prefetching */ | |||
| { | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); /* both groups are loaded in full before either side is stored */ | |||
| LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7); | |||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y2); | |||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x2); | |||
| } | |||
| if (n & 7) /* remainder dispatch mirrors the unit-stride tail, using the doubled strides */ | |||
| { | |||
| if ((n & 4) && (n & 2) && (n & 1)) /* 7 left */ | |||
| { | |||
| LD_DP7_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6); | |||
| LD_DP7_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6); | |||
| ST_DP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y2); | |||
| ST_DP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x2); | |||
| } | |||
| else if ((n & 4) && (n & 2)) /* 6 left */ | |||
| { | |||
| LD_DP6_INC(px, inc_x2, x0, x1, x2, x3, x4, x5); | |||
| LD_DP6_INC(py, inc_y2, y0, y1, y2, y3, y4, y5); | |||
| ST_DP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y2); | |||
| ST_DP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x2); | |||
| } | |||
| else if ((n & 4) && (n & 1)) /* 5 left */ | |||
| { | |||
| LD_DP5_INC(px, inc_x2, x0, x1, x2, x3, x4); | |||
| LD_DP5_INC(py, inc_y2, y0, y1, y2, y3, y4); | |||
| ST_DP5_INC(x0, x1, x2, x3, x4, srcy, inc_y2); | |||
| ST_DP5_INC(y0, y1, y2, y3, y4, srcx, inc_x2); | |||
| } | |||
| else if ((n & 2) && (n & 1)) /* 3 left */ | |||
| { | |||
| LD_DP3_INC(px, inc_x2, x0, x1, x2); | |||
| LD_DP3_INC(py, inc_y2, y0, y1, y2); | |||
| ST_DP3_INC(x0, x1, x2, srcy, inc_y2); | |||
| ST_DP3_INC(y0, y1, y2, srcx, inc_x2); | |||
| } | |||
| else if (n & 4) /* 4 left */ | |||
| { | |||
| LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); | |||
| LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); | |||
| ST_DP4_INC(x0, x1, x2, x3, srcy, inc_y2); | |||
| ST_DP4_INC(y0, y1, y2, y3, srcx, inc_x2); | |||
| } | |||
| else if (n & 2) /* 2 left */ | |||
| { | |||
| LD_DP2_INC(px, inc_x2, x0, x1); | |||
| LD_DP2_INC(py, inc_y2, y0, y1); | |||
| ST_DP2_INC(x0, x1, srcy, inc_y2); | |||
| ST_DP2_INC(y0, y1, srcx, inc_x2); | |||
| } | |||
| else if (n & 1) /* 1 left */ | |||
| { | |||
| x0 = LD_DP(px); | |||
| y0 = LD_DP(py); | |||
| ST_DP(y0, srcx); | |||
| ST_DP(x0, srcy); | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||