Add msa optimization for AXPY, COPY, SCALE, SWAP (tags/v0.2.20^2)
| @@ -42,15 +42,29 @@ CASUMKERNEL = ../mips/asum.c | |||||
| ZASUMKERNEL = ../mips/asum.c | ZASUMKERNEL = ../mips/asum.c | ||||
| endif | endif | ||||
| ifdef HAVE_MSA | |||||
| SAXPYKERNEL = ../mips/saxpy_msa.c | |||||
| DAXPYKERNEL = ../mips/daxpy_msa.c | |||||
| CAXPYKERNEL = ../mips/caxpy_msa.c | |||||
| ZAXPYKERNEL = ../mips/zaxpy_msa.c | |||||
| else | |||||
| SAXPYKERNEL = ../mips/axpy.c | SAXPYKERNEL = ../mips/axpy.c | ||||
| DAXPYKERNEL = ../mips/axpy.c | DAXPYKERNEL = ../mips/axpy.c | ||||
| CAXPYKERNEL = ../mips/zaxpy.c | CAXPYKERNEL = ../mips/zaxpy.c | ||||
| ZAXPYKERNEL = ../mips/zaxpy.c | ZAXPYKERNEL = ../mips/zaxpy.c | ||||
| endif | |||||
| ifdef HAVE_MSA | |||||
| SCOPYKERNEL = ../mips/scopy_msa.c | |||||
| DCOPYKERNEL = ../mips/dcopy_msa.c | |||||
| CCOPYKERNEL = ../mips/ccopy_msa.c | |||||
| ZCOPYKERNEL = ../mips/zcopy_msa.c | |||||
| else | |||||
| SCOPYKERNEL = ../mips/copy.c | SCOPYKERNEL = ../mips/copy.c | ||||
| DCOPYKERNEL = ../mips/copy.c | DCOPYKERNEL = ../mips/copy.c | ||||
| CCOPYKERNEL = ../mips/zcopy.c | CCOPYKERNEL = ../mips/zcopy.c | ||||
| ZCOPYKERNEL = ../mips/zcopy.c | ZCOPYKERNEL = ../mips/zcopy.c | ||||
| endif | |||||
| ifdef HAVE_MSA | ifdef HAVE_MSA | ||||
| SDOTKERNEL = ../mips/sdot_msa.c | SDOTKERNEL = ../mips/sdot_msa.c | ||||
| @@ -74,15 +88,29 @@ DROTKERNEL = ../mips/rot.c | |||||
| CROTKERNEL = ../mips/zrot.c | CROTKERNEL = ../mips/zrot.c | ||||
| ZROTKERNEL = ../mips/zrot.c | ZROTKERNEL = ../mips/zrot.c | ||||
| ifdef HAVE_MSA | |||||
| SSCALKERNEL = ../mips/sscal_msa.c | |||||
| DSCALKERNEL = ../mips/dscal_msa.c | |||||
| CSCALKERNEL = ../mips/cscal_msa.c | |||||
| ZSCALKERNEL = ../mips/zscal_msa.c | |||||
| else | |||||
| SSCALKERNEL = ../mips/scal.c | SSCALKERNEL = ../mips/scal.c | ||||
| DSCALKERNEL = ../mips/scal.c | DSCALKERNEL = ../mips/scal.c | ||||
| CSCALKERNEL = ../mips/zscal.c | CSCALKERNEL = ../mips/zscal.c | ||||
| ZSCALKERNEL = ../mips/zscal.c | ZSCALKERNEL = ../mips/zscal.c | ||||
| endif | |||||
| ifdef HAVE_MSA | |||||
| SSWAPKERNEL = ../mips/sswap_msa.c | |||||
| DSWAPKERNEL = ../mips/dswap_msa.c | |||||
| CSWAPKERNEL = ../mips/cswap_msa.c | |||||
| ZSWAPKERNEL = ../mips/zswap_msa.c | |||||
| else | |||||
| SSWAPKERNEL = ../mips/swap.c | SSWAPKERNEL = ../mips/swap.c | ||||
| DSWAPKERNEL = ../mips/swap.c | DSWAPKERNEL = ../mips/swap.c | ||||
| CSWAPKERNEL = ../mips/zswap.c | CSWAPKERNEL = ../mips/zswap.c | ||||
| ZSWAPKERNEL = ../mips/zswap.c | ZSWAPKERNEL = ../mips/zswap.c | ||||
| endif | |||||
| ifdef HAVE_MSA | ifdef HAVE_MSA | ||||
| SGEMVNKERNEL = ../mips/sgemv_n_msa.c | SGEMVNKERNEL = ../mips/sgemv_n_msa.c | ||||
| @@ -0,0 +1,471 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| #if !defined(CONJ) | |||||
| #define OP0 += | |||||
| #define OP1 -= | |||||
| #define OP2 += | |||||
| #else | |||||
| #define OP0 -= | |||||
| #define OP1 += | |||||
| #define OP2 -= | |||||
| #endif | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i, inc_x2, inc_y2; | |||||
| FLOAT *py; | |||||
| v4f32 x0, x1, x2, x3, x4, x5, x6, x7; | |||||
| v4f32 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec; | |||||
| v4f32 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i; | |||||
| v4f32 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i; | |||||
| FLOAT xd0, xd1, xd2, xd3, xd4, xd5, xd6, xd7; | |||||
| FLOAT yd0, yd1, yd2, yd3, yd4, yd5, yd6, yd7; | |||||
| if (n < 0) return(0); | |||||
| if ((da_r == 0.0) && (da_i == 0.0)) return(0); | |||||
| py = y; | |||||
| if ((1 == inc_x) && (1 == inc_y)) | |||||
| { | |||||
| FLOAT *x_pref, *y_pref; | |||||
| BLASLONG pref_offset; | |||||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| x_pref = x + pref_offset + 64; | |||||
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| y_pref = y + pref_offset + 64; | |||||
| dar_vec = COPY_FLOAT_TO_VECTOR(da_r); | |||||
| dai_vec = COPY_FLOAT_TO_VECTOR(da_i); | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| PREF_OFFSET(x_pref, 0); | |||||
| PREF_OFFSET(x_pref, 32); | |||||
| PREF_OFFSET(x_pref, 64); | |||||
| PREF_OFFSET(x_pref, 96); | |||||
| PREF_OFFSET(y_pref, 0); | |||||
| PREF_OFFSET(y_pref, 32); | |||||
| PREF_OFFSET(y_pref, 64); | |||||
| PREF_OFFSET(y_pref, 96); | |||||
| x_pref += 32; | |||||
| y_pref += 32; | |||||
| LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7); | |||||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_W2_SP(y1, y0, y0r, y0i); | |||||
| PCKEVOD_W2_SP(x3, x2, x1r, x1i); | |||||
| PCKEVOD_W2_SP(y3, y2, y1r, y1i); | |||||
| PCKEVOD_W2_SP(x5, x4, x2r, x2i); | |||||
| PCKEVOD_W2_SP(y5, y4, y2r, y2i); | |||||
| PCKEVOD_W2_SP(x7, x6, x3r, x3i); | |||||
| PCKEVOD_W2_SP(y7, y6, y3r, y3i); | |||||
| FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y1i OP0 dar_vec * x1i; | |||||
| y2i OP0 dar_vec * x2i; | |||||
| y3i OP0 dar_vec * x3i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y1r OP1 dai_vec * x1i; | |||||
| y2r OP1 dai_vec * x2i; | |||||
| y3r OP1 dai_vec * x3i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| y1i OP2 dai_vec * x1r; | |||||
| y2i OP2 dai_vec * x2r; | |||||
| y3i OP2 dai_vec * x3r; | |||||
| ILVRL_W2_SP(y0i, y0r, y0, y1); | |||||
| ILVRL_W2_SP(y1i, y1r, y2, y3); | |||||
| ILVRL_W2_SP(y2i, y2r, y4, y5); | |||||
| ILVRL_W2_SP(y3i, y3r, y6, y7); | |||||
| ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| LD_SP4_INC(x, 4, x0, x1, x2, x3); | |||||
| LD_SP4_INC(py, 4, y0, y1, y2, y3); | |||||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_W2_SP(y1, y0, y0r, y0i); | |||||
| PCKEVOD_W2_SP(x3, x2, x1r, x1i); | |||||
| PCKEVOD_W2_SP(y3, y2, y1r, y1i); | |||||
| FMADD2(x0r, x1r, dar_vec, y0r, y1r); | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y1i OP0 dar_vec * x1i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y1r OP1 dai_vec * x1i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| y1i OP2 dai_vec * x1r; | |||||
| ILVRL_W2_SP(y0i, y0r, y0, y1); | |||||
| ILVRL_W2_SP(y1i, y1r, y2, y3); | |||||
| ST_SP4_INC(y0, y1, y2, y3, y, 4); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_SP2_INC(x, 4, x0, x1); | |||||
| LD_SP2_INC(py, 4, y0, y1); | |||||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_W2_SP(y1, y0, y0r, y0i); | |||||
| y0r += dar_vec * x0r; | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| ILVRL_W2_SP(y0i, y0r, y0, y1); | |||||
| ST_SP2_INC(y0, y1, y, 4); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_GP4_INC(x, 1, xd0, xd1, xd2, xd3); | |||||
| LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3); | |||||
| FMADD2(xd0, xd2, da_r, yd0, yd2); | |||||
| yd1 OP0 da_r * xd1; | |||||
| yd3 OP0 da_r * xd3; | |||||
| yd0 OP1 da_i * xd1; | |||||
| yd2 OP1 da_i * xd3; | |||||
| yd1 OP2 da_i * xd0; | |||||
| yd3 OP2 da_i * xd2; | |||||
| ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(x, 1, xd0, xd1); | |||||
| LD_GP2_INC(py, 1, yd0, yd1); | |||||
| yd0 += da_r * xd0; | |||||
| yd1 OP0 da_r * xd1; | |||||
| yd0 OP1 da_i * xd1; | |||||
| yd1 OP2 da_i * xd0; | |||||
| ST_GP2_INC(yd0, yd1, y, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
| else if (1 == inc_y) | |||||
| { | |||||
| FLOAT *y_pref; | |||||
| BLASLONG pref_offset; | |||||
| v4f32 x8, x9, x10, x11, x12, x13, x14; | |||||
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| y_pref = y + pref_offset + 64; | |||||
| inc_x2 = 2 * inc_x; | |||||
| dar_vec = COPY_FLOAT_TO_VECTOR(da_r); | |||||
| dai_vec = COPY_FLOAT_TO_VECTOR(da_i); | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| PREF_OFFSET(y_pref, 0); | |||||
| PREF_OFFSET(y_pref, 32); | |||||
| PREF_OFFSET(y_pref, 64); | |||||
| PREF_OFFSET(y_pref, 96); | |||||
| y_pref += 32; | |||||
| LD_SP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x14); | |||||
| LD_SP7_INC(x, inc_x2, x8, x9, x10, x11, x12, x13, x7); | |||||
| PCKEV_D2_SP(x1, x0, x3, x2, x0, x1); | |||||
| PCKEV_D2_SP(x5, x4, x14, x6, x2, x3); | |||||
| PCKEV_D2_SP(x9, x8, x11, x10, x4, x5); | |||||
| x6 = (v4f32) __msa_pckev_d((v2i64) x13, (v2i64) x12); | |||||
| x7 = (v4f32) __msa_insert_w((v4i32) x7, 2, *((int *) x)); | |||||
| x7 = (v4f32) __msa_insert_w((v4i32) x7, 3, *((int *) (x + 1))); | |||||
| x += inc_x2; | |||||
| LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7); | |||||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_W2_SP(y1, y0, y0r, y0i); | |||||
| PCKEVOD_W2_SP(x3, x2, x1r, x1i); | |||||
| PCKEVOD_W2_SP(y3, y2, y1r, y1i); | |||||
| PCKEVOD_W2_SP(x5, x4, x2r, x2i); | |||||
| PCKEVOD_W2_SP(y5, y4, y2r, y2i); | |||||
| PCKEVOD_W2_SP(x7, x6, x3r, x3i); | |||||
| PCKEVOD_W2_SP(y7, y6, y3r, y3i); | |||||
| FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y1i OP0 dar_vec * x1i; | |||||
| y2i OP0 dar_vec * x2i; | |||||
| y3i OP0 dar_vec * x3i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y1r OP1 dai_vec * x1i; | |||||
| y2r OP1 dai_vec * x2i; | |||||
| y3r OP1 dai_vec * x3i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| y1i OP2 dai_vec * x1r; | |||||
| y2i OP2 dai_vec * x2r; | |||||
| y3i OP2 dai_vec * x3r; | |||||
| ILVRL_W2_SP(y0i, y0r, y0, y1); | |||||
| ILVRL_W2_SP(y1i, y1r, y2, y3); | |||||
| ILVRL_W2_SP(y2i, y2r, y4, y5); | |||||
| ILVRL_W2_SP(y3i, y3r, y6, y7); | |||||
| ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| LD_SP7_INC(x, inc_x2, x0, x1, x2, x6, x4, x5, x3); | |||||
| PCKEV_D2_SP(x1, x0, x6, x2, x0, x1); | |||||
| x2 = (v4f32) __msa_pckev_d((v2i64) x5, (v2i64) x4); | |||||
| x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x)); | |||||
| x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) (x + 1))); | |||||
| x += inc_x2; | |||||
| LD_SP4_INC(py, 4, y0, y1, y2, y3); | |||||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_W2_SP(y1, y0, y0r, y0i); | |||||
| PCKEVOD_W2_SP(x3, x2, x1r, x1i); | |||||
| PCKEVOD_W2_SP(y3, y2, y1r, y1i); | |||||
| FMADD2(x0r, x1r, dar_vec, y0r, y1r); | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y1i OP0 dar_vec * x1i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y1r OP1 dai_vec * x1i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| y1i OP2 dai_vec * x1r; | |||||
| ILVRL_W2_SP(y0i, y0r, y0, y1); | |||||
| ILVRL_W2_SP(y1i, y1r, y2, y3); | |||||
| ST_SP4_INC(y0, y1, y2, y3, y, 4); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_SP3_INC(x, inc_x2, x0, x2, x1); | |||||
| x0 = (v4f32) __msa_pckev_d((v2i64) x2, (v2i64) x0); | |||||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x)); | |||||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) (x + 1))); | |||||
| x += inc_x2; | |||||
| LD_SP2_INC(py, 4, y0, y1); | |||||
| PCKEVOD_W2_SP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_W2_SP(y1, y0, y0r, y0i); | |||||
| y0r += dar_vec * x0r; | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| ILVRL_W2_SP(y0i, y0r, y0, y1); | |||||
| ST_SP2_INC(y0, y1, y, 4); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| xd0 = x[0]; | |||||
| xd1 = x[1]; | |||||
| x += inc_x2; | |||||
| xd2 = x[0]; | |||||
| xd3 = x[1]; | |||||
| x += inc_x2; | |||||
| LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3); | |||||
| FMADD2(xd0, xd2, da_r, yd0, yd2); | |||||
| yd1 OP0 da_r * xd1; | |||||
| yd3 OP0 da_r * xd3; | |||||
| yd0 OP1 da_i * xd1; | |||||
| yd2 OP1 da_i * xd3; | |||||
| yd1 OP2 da_i * xd0; | |||||
| yd3 OP2 da_i * xd2; | |||||
| ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(x, 1, xd0, xd1); | |||||
| LD_GP2_INC(py, 1, yd0, yd1); | |||||
| yd0 += da_r * xd0; | |||||
| yd1 OP0 da_r * xd1; | |||||
| yd0 OP1 da_i * xd1; | |||||
| yd1 OP2 da_i * xd0; | |||||
| ST_GP2_INC(yd0, yd1, y, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| inc_x2 = 2 * inc_x; | |||||
| inc_y2 = 2 * inc_y; | |||||
| for (i = (n >> 2); i--;) | |||||
| { | |||||
| xd0 = x[0]; | |||||
| xd1 = x[1]; | |||||
| x += inc_x2; | |||||
| xd2 = x[0]; | |||||
| xd3 = x[1]; | |||||
| x += inc_x2; | |||||
| xd4 = x[0]; | |||||
| xd5 = x[1]; | |||||
| x += inc_x2; | |||||
| xd6 = x[0]; | |||||
| xd7 = x[1]; | |||||
| x += inc_x2; | |||||
| yd0 = py[0]; | |||||
| yd1 = py[1]; | |||||
| py += inc_y2; | |||||
| yd2 = py[0]; | |||||
| yd3 = py[1]; | |||||
| py += inc_y2; | |||||
| yd4 = py[0]; | |||||
| yd5 = py[1]; | |||||
| py += inc_y2; | |||||
| yd6 = py[0]; | |||||
| yd7 = py[1]; | |||||
| py += inc_y2; | |||||
| FMADD4(xd0, xd2, xd4, xd6, da_r, yd0, yd2, yd4, yd6); | |||||
| yd1 OP0 da_r * xd1; | |||||
| yd3 OP0 da_r * xd3; | |||||
| yd5 OP0 da_r * xd5; | |||||
| yd7 OP0 da_r * xd7; | |||||
| yd0 OP1 da_i * xd1; | |||||
| yd2 OP1 da_i * xd3; | |||||
| yd4 OP1 da_i * xd5; | |||||
| yd6 OP1 da_i * xd7; | |||||
| yd1 OP2 da_i * xd0; | |||||
| yd3 OP2 da_i * xd2; | |||||
| yd5 OP2 da_i * xd4; | |||||
| yd7 OP2 da_i * xd6; | |||||
| y[0] = yd0; | |||||
| y[1] = yd1; | |||||
| y += inc_y2; | |||||
| y[0] = yd2; | |||||
| y[1] = yd3; | |||||
| y += inc_y2; | |||||
| y[0] = yd4; | |||||
| y[1] = yd5; | |||||
| y += inc_y2; | |||||
| y[0] = yd6; | |||||
| y[1] = yd7; | |||||
| y += inc_y2; | |||||
| } | |||||
| if (n & 3) | |||||
| { | |||||
| if (n & 2) | |||||
| { | |||||
| xd0 = x[0]; | |||||
| xd1 = x[1]; | |||||
| x += inc_x2; | |||||
| xd2 = x[0]; | |||||
| xd3 = x[1]; | |||||
| x += inc_x2; | |||||
| yd0 = py[0]; | |||||
| yd1 = py[1]; | |||||
| py += inc_y2; | |||||
| yd2 = py[0]; | |||||
| yd3 = py[1]; | |||||
| py += inc_y2; | |||||
| FMADD2(xd0, xd2, da_r, yd0, yd2); | |||||
| yd1 OP0 da_r * xd1; | |||||
| yd3 OP0 da_r * xd3; | |||||
| yd0 OP1 da_i * xd1; | |||||
| yd2 OP1 da_i * xd3; | |||||
| yd1 OP2 da_i * xd0; | |||||
| yd3 OP2 da_i * xd2; | |||||
| y[0] = yd0; | |||||
| y[1] = yd1; | |||||
| y += inc_y2; | |||||
| y[0] = yd2; | |||||
| y[1] = yd3; | |||||
| y += inc_y2; | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| xd0 = x[0]; | |||||
| xd1 = x[1]; | |||||
| yd0 = y[0]; | |||||
| yd1 = y[1]; | |||||
| yd0 += da_r * xd0; | |||||
| yd1 OP0 da_r * xd1; | |||||
| yd0 OP1 da_i * xd1; | |||||
| yd1 OP2 da_i * xd0; | |||||
| y[0] = yd0; | |||||
| y[1] = yd1; | |||||
| } | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,201 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i, inc_x2, inc_y2; | |||||
| v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; | |||||
| FLOAT f0, f1, f2, f3, f4, f5, f6, f7; | |||||
| if (n < 0) return (0); | |||||
| if ((1 == inc_x) && (1 == inc_y)) | |||||
| { | |||||
| if (n > 31) | |||||
| { | |||||
| FLOAT *x_pref; | |||||
| BLASLONG pref_offset; | |||||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| x_pref = x + pref_offset + 128 + 32; | |||||
| LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| for (i = (n >> 5) - 1; i--;) | |||||
| { | |||||
| PREF_OFFSET(x_pref, 0); | |||||
| PREF_OFFSET(x_pref, 32); | |||||
| PREF_OFFSET(x_pref, 64); | |||||
| PREF_OFFSET(x_pref, 96); | |||||
| PREF_OFFSET(x_pref, 128); | |||||
| PREF_OFFSET(x_pref, 160); | |||||
| PREF_OFFSET(x_pref, 192); | |||||
| PREF_OFFSET(x_pref, 224); | |||||
| x_pref += 64; | |||||
| x8 = LD_SP(x); x += 4; | |||||
| ST_SP(x0, y); y += 4; | |||||
| x9 = LD_SP(x); x += 4; | |||||
| ST_SP(x1, y); y += 4; | |||||
| x10 = LD_SP(x); x += 4; | |||||
| ST_SP(x2, y); y += 4; | |||||
| x11 = LD_SP(x); x += 4; | |||||
| ST_SP(x3, y); y += 4; | |||||
| x12 = LD_SP(x); x += 4; | |||||
| ST_SP(x4, y); y += 4; | |||||
| x13 = LD_SP(x); x += 4; | |||||
| ST_SP(x5, y); y += 4; | |||||
| x14 = LD_SP(x); x += 4; | |||||
| ST_SP(x6, y); y += 4; | |||||
| x15 = LD_SP(x); x += 4; | |||||
| ST_SP(x7, y); y += 4; | |||||
| x0 = LD_SP(x); x += 4; | |||||
| ST_SP(x8, y); y += 4; | |||||
| x1 = LD_SP(x); x += 4; | |||||
| ST_SP(x9, y); y += 4; | |||||
| x2 = LD_SP(x); x += 4; | |||||
| ST_SP(x10, y); y += 4; | |||||
| x3 = LD_SP(x); x += 4; | |||||
| ST_SP(x11, y); y += 4; | |||||
| x4 = LD_SP(x); x += 4; | |||||
| ST_SP(x12, y); y += 4; | |||||
| x5 = LD_SP(x); x += 4; | |||||
| ST_SP(x13, y); y += 4; | |||||
| x6 = LD_SP(x); x += 4; | |||||
| ST_SP(x14, y); y += 4; | |||||
| x7 = LD_SP(x); x += 4; | |||||
| ST_SP(x15, y); y += 4; | |||||
| } | |||||
| x8 = LD_SP(x); x += 4; | |||||
| x9 = LD_SP(x); x += 4; | |||||
| ST_SP(x0, y); y += 4; | |||||
| x10 = LD_SP(x); x += 4; | |||||
| ST_SP(x1, y); y += 4; | |||||
| x11 = LD_SP(x); x += 4; | |||||
| ST_SP(x2, y); y += 4; | |||||
| x12 = LD_SP(x); x += 4; | |||||
| ST_SP(x3, y); y += 4; | |||||
| x13 = LD_SP(x); x += 4; | |||||
| ST_SP(x4, y); y += 4; | |||||
| x14 = LD_SP(x); x += 4; | |||||
| ST_SP(x5, y); y += 4; | |||||
| x15 = LD_SP(x); x += 4; | |||||
| ST_SP(x6, y); y += 4; | |||||
| ST_SP(x7, y); y += 4; | |||||
| ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4); | |||||
| } | |||||
| if (n & 31) | |||||
| { | |||||
| if (n & 16) | |||||
| { | |||||
| LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4); | |||||
| } | |||||
| if (n & 8) | |||||
| { | |||||
| LD_SP4_INC(x, 4, x0, x1, x2, x3); | |||||
| ST_SP4_INC(x0, x1, x2, x3, y, 4); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_SP2_INC(x, 4, x0, x1); | |||||
| ST_SP2_INC(x0, x1, y, 4); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_GP4_INC(x, 1, f0, f1, f2, f3); | |||||
| ST_GP4_INC(f0, f1, f2, f3, y, 1); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(x, 1, f0, f1); | |||||
| ST_GP2_INC(f0, f1, y, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| inc_x2 = 2 * inc_x; | |||||
| inc_y2 = 2 * inc_y; | |||||
| for (i = (n >> 2); i--;) | |||||
| { | |||||
| f0 = *x; | |||||
| f1 = *(x+1); x += inc_x2; | |||||
| f2 = *x; | |||||
| f3 = *(x+1); x += inc_x2; | |||||
| f4 = *x; | |||||
| f5 = *(x+1); x += inc_x2; | |||||
| f6 = *x; | |||||
| f7 = *(x+1); x += inc_x2; | |||||
| *y = f0; | |||||
| *(y+1) = f1; y += inc_y2; | |||||
| *y = f2; | |||||
| *(y+1) = f3; y += inc_y2; | |||||
| *y = f4; | |||||
| *(y+1) = f5; y += inc_y2; | |||||
| *y = f6; | |||||
| *(y+1) = f7; y += inc_y2; | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| f0 = *x; | |||||
| f1 = *(x+1); x += inc_x2; | |||||
| f2 = *x; | |||||
| f3 = *(x+1); x += inc_x2; | |||||
| *y = f0; | |||||
| *(y+1) = f1; y += inc_y2; | |||||
| *y = f2; | |||||
| *(y+1) = f3; y += inc_y2; | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(x, 1, f0, f1); | |||||
| ST_GP2_INC(f0, f1, y, 1); | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,281 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, | |||||
| FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, | |||||
| BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i = 0, pref_offsetx, pref_offsety; | |||||
| FLOAT *px, *py; | |||||
| BLASLONG inc_x2, inc_y2; | |||||
| FLOAT x0, x1, x2, x3, x4, x5, x6, x7; | |||||
| FLOAT y0, y1, y2, y3, y4, y5, y6, y7; | |||||
| v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7; | |||||
| v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7; | |||||
| if (n < 0) return (0); | |||||
| pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offsetx > 0) | |||||
| { | |||||
| pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; | |||||
| pref_offsetx = pref_offsetx / sizeof(FLOAT); | |||||
| } | |||||
| pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offsety > 0) | |||||
| { | |||||
| pref_offsety = L1_DATA_LINESIZE - pref_offsety; | |||||
| pref_offsety = pref_offsety / sizeof(FLOAT); | |||||
| } | |||||
| px = srcx; | |||||
| py = srcy; | |||||
| inc_x2 = 2 * inc_x; | |||||
| inc_y2 = 2 * inc_y; | |||||
| if ((1 == inc_x) && (1 == inc_y)) | |||||
| { | |||||
| if (n >> 4) | |||||
| { | |||||
| LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7); | |||||
| for (i = (n >> 4) - 1; i--;) | |||||
| { | |||||
| PREFETCH(px + pref_offsetx + 32); | |||||
| PREFETCH(px + pref_offsetx + 40); | |||||
| PREFETCH(px + pref_offsetx + 48); | |||||
| PREFETCH(px + pref_offsetx + 56); | |||||
| PREFETCH(py + pref_offsety + 32); | |||||
| PREFETCH(py + pref_offsety + 40); | |||||
| PREFETCH(py + pref_offsety + 48); | |||||
| PREFETCH(py + pref_offsety + 56); | |||||
| yv0 = LD_SP(py); py += 4; | |||||
| ST_SP(xv0, srcy); srcy += 4; | |||||
| yv1 = LD_SP(py); py += 4; | |||||
| ST_SP(xv1, srcy); srcy += 4; | |||||
| yv2 = LD_SP(py); py += 4; | |||||
| ST_SP(xv2, srcy); srcy += 4; | |||||
| yv3 = LD_SP(py); py += 4; | |||||
| ST_SP(xv3, srcy); srcy += 4; | |||||
| yv4 = LD_SP(py); py += 4; | |||||
| ST_SP(xv4, srcy); srcy += 4; | |||||
| yv5 = LD_SP(py); py += 4; | |||||
| ST_SP(xv5, srcy); srcy += 4; | |||||
| yv6 = LD_SP(py); py += 4; | |||||
| ST_SP(xv6, srcy); srcy += 4; | |||||
| yv7 = LD_SP(py); py += 4; | |||||
| ST_SP(xv7, srcy); srcy += 4; | |||||
| xv0 = LD_SP(px); px += 4; | |||||
| ST_SP(yv0, srcx); srcx += 4; | |||||
| xv1 = LD_SP(px); px += 4; | |||||
| ST_SP(yv1, srcx); srcx += 4; | |||||
| xv2 = LD_SP(px); px += 4; | |||||
| ST_SP(yv2, srcx); srcx += 4; | |||||
| xv3 = LD_SP(px); px += 4; | |||||
| ST_SP(yv3, srcx); srcx += 4; | |||||
| xv4 = LD_SP(px); px += 4; | |||||
| ST_SP(yv4, srcx); srcx += 4; | |||||
| xv5 = LD_SP(px); px += 4; | |||||
| ST_SP(yv5, srcx); srcx += 4; | |||||
| xv6 = LD_SP(px); px += 4; | |||||
| ST_SP(yv6, srcx); srcx += 4; | |||||
| xv7 = LD_SP(px); px += 4; | |||||
| ST_SP(yv7, srcx); srcx += 4; | |||||
| } | |||||
| LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7); | |||||
| ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4); | |||||
| ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if ((n & 8) && (n & 4) && (n & 2)) | |||||
| { | |||||
| LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6); | |||||
| LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6); | |||||
| ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4); | |||||
| ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4); | |||||
| } | |||||
| else if ((n & 8) && (n & 4)) | |||||
| { | |||||
| LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5); | |||||
| LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5); | |||||
| ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4); | |||||
| ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4); | |||||
| } | |||||
| else if ((n & 8) && (n & 2)) | |||||
| { | |||||
| LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4); | |||||
| LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4); | |||||
| ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4); | |||||
| ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4); | |||||
| } | |||||
| else if ((n & 4) && (n & 2)) | |||||
| { | |||||
| LD_SP3_INC(px, 4, xv0, xv1, xv2); | |||||
| LD_SP3_INC(py, 4, yv0, yv1, yv2); | |||||
| ST_SP3_INC(xv0, xv1, xv2, srcy, 4); | |||||
| ST_SP3_INC(yv0, yv1, yv2, srcx, 4); | |||||
| } | |||||
| else if (n & 8) | |||||
| { | |||||
| LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3); | |||||
| LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3); | |||||
| ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4); | |||||
| ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4); | |||||
| } | |||||
| else if (n & 4) | |||||
| { | |||||
| LD_SP2_INC(px, 4, xv0, xv1); | |||||
| LD_SP2_INC(py, 4, yv0, yv1); | |||||
| ST_SP2_INC(xv0, xv1, srcy, 4); | |||||
| ST_SP2_INC(yv0, yv1, srcx, 4); | |||||
| } | |||||
| else if (n & 2) | |||||
| { | |||||
| xv0 = LD_SP(px); | |||||
| yv0 = LD_SP(py); | |||||
| px += 4; | |||||
| py += 4; | |||||
| ST_SP(xv0, srcy); | |||||
| ST_SP(yv0, srcx); | |||||
| srcx += 4; | |||||
| srcy += 4; | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(px, 1, x0, x1); | |||||
| LD_GP2_INC(py, 1, y0, y1); | |||||
| ST_GP2_INC(x0, x1, srcy, 1); | |||||
| ST_GP2_INC(y0, y1, srcx, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for (i = (n >> 2); i--;) | |||||
| { | |||||
| x0 = srcx[0 * inc_x2]; | |||||
| x1 = srcx[0 * inc_x2 + 1]; | |||||
| x2 = srcx[1 * inc_x2]; | |||||
| x3 = srcx[1 * inc_x2 + 1]; | |||||
| x4 = srcx[2 * inc_x2]; | |||||
| x5 = srcx[2 * inc_x2 + 1]; | |||||
| x6 = srcx[3 * inc_x2]; | |||||
| x7 = srcx[3 * inc_x2 + 1]; | |||||
| y0 = srcy[0 * inc_y2]; | |||||
| y1 = srcy[0 * inc_y2 + 1]; | |||||
| y2 = srcy[1 * inc_y2]; | |||||
| y3 = srcy[1 * inc_y2 + 1]; | |||||
| y4 = srcy[2 * inc_y2]; | |||||
| y5 = srcy[2 * inc_y2 + 1]; | |||||
| y6 = srcy[3 * inc_y2]; | |||||
| y7 = srcy[3 * inc_y2 + 1]; | |||||
| srcx[0 * inc_x2] = y0; | |||||
| srcx[0 * inc_x2 + 1] = y1; | |||||
| srcx[1 * inc_x2] = y2; | |||||
| srcx[1 * inc_x2 + 1] = y3; | |||||
| srcx[2 * inc_x2] = y4; | |||||
| srcx[2 * inc_x2 + 1] = y5; | |||||
| srcx[3 * inc_x2] = y6; | |||||
| srcx[3 * inc_x2 + 1] = y7; | |||||
| srcy[0 * inc_y2] = x0; | |||||
| srcy[0 * inc_y2 + 1] = x1; | |||||
| srcy[1 * inc_y2] = x2; | |||||
| srcy[1 * inc_y2 + 1] = x3; | |||||
| srcy[2 * inc_y2] = x4; | |||||
| srcy[2 * inc_y2 + 1] = x5; | |||||
| srcy[3 * inc_y2] = x6; | |||||
| srcy[3 * inc_y2 + 1] = x7; | |||||
| srcx += 4 * inc_x2; | |||||
| srcy += 4 * inc_y2; | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| x0 = srcx[0 * inc_x2]; | |||||
| x1 = srcx[0 * inc_x2 + 1]; | |||||
| x2 = srcx[1 * inc_x2]; | |||||
| x3 = srcx[1 * inc_x2 + 1]; | |||||
| y0 = srcy[0 * inc_y2]; | |||||
| y1 = srcy[0 * inc_y2 + 1]; | |||||
| y2 = srcy[1 * inc_y2]; | |||||
| y3 = srcy[1 * inc_y2 + 1]; | |||||
| srcx[0 * inc_x2] = y0; | |||||
| srcx[0 * inc_x2 + 1] = y1; | |||||
| srcx[1 * inc_x2] = y2; | |||||
| srcx[1 * inc_x2 + 1] = y3; | |||||
| srcy[0 * inc_y2] = x0; | |||||
| srcy[0 * inc_y2 + 1] = x1; | |||||
| srcy[1 * inc_y2] = x2; | |||||
| srcy[1 * inc_y2 + 1] = x3; | |||||
| srcx += 2 * inc_x2; | |||||
| srcy += 2 * inc_y2; | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| x0 = srcx[0 * inc_x2]; | |||||
| x1 = srcx[0 * inc_x2 + 1]; | |||||
| y0 = srcy[0 * inc_y2]; | |||||
| y1 = srcy[0 * inc_y2 + 1]; | |||||
| srcx[0 * inc_x2] = y0; | |||||
| srcx[0 * inc_x2 + 1] = y1; | |||||
| srcy[0 * inc_y2] = x0; | |||||
| srcy[0 * inc_y2 + 1] = x1; | |||||
| srcx += inc_x2; | |||||
| srcy += inc_y2; | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,246 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
/* Compound-assignment operators selected by whether CONJ is defined.
 * NOTE(review): OP0/OP1/OP2 are not referenced by the CNAME routine that
 * immediately follows these definitions; they look like leftovers from a
 * complex-valued AXPY variant -- confirm before removing. */
| #if !defined(CONJ) | |||||
| #define OP0 += | |||||
| #define OP1 -= | |||||
| #define OP2 += | |||||
| #else | |||||
| #define OP0 -= | |||||
| #define OP1 += | |||||
| #define OP2 -= | |||||
| #endif | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||||
| BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *py; | |||||
| v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; | |||||
| v2f64 da_vec, zero_v = {0}; | |||||
| if ((n < 0) || (da == 0.0)) return(0); | |||||
| py = y; | |||||
| if ((1 == inc_x) && (1 == inc_y)) | |||||
| { | |||||
| FLOAT *x_pref, *y_pref; | |||||
| BLASLONG pref_offset; | |||||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| x_pref = x + pref_offset + 32; | |||||
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| y_pref = y + pref_offset + 32; | |||||
| da_vec = COPY_DOUBLE_TO_VECTOR(da); | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| PREF_OFFSET(x_pref, 0); | |||||
| PREF_OFFSET(x_pref, 32); | |||||
| PREF_OFFSET(x_pref, 64); | |||||
| PREF_OFFSET(x_pref, 96); | |||||
| PREF_OFFSET(y_pref, 0); | |||||
| PREF_OFFSET(y_pref, 32); | |||||
| PREF_OFFSET(y_pref, 64); | |||||
| PREF_OFFSET(y_pref, 96); | |||||
| x_pref += 16; | |||||
| y_pref += 16; | |||||
| LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); | |||||
| FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); | |||||
| FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7); | |||||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| LD_DP4_INC(x, 2, x0, x1, x2, x3); | |||||
| LD_DP4_INC(py, 2, y0, y1, y2, y3); | |||||
| FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); | |||||
| ST_DP4_INC(y0, y1, y2, y3, y, 2); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP2_INC(x, 2, x0, x1); | |||||
| LD_DP2_INC(py, 2, y0, y1); | |||||
| FMADD2(x0, x1, da_vec, y0, y1); | |||||
| ST_DP2_INC(y0, y1, y, 2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| x0 = LD_DP(x); x += 2; | |||||
| y0 = LD_DP(py); py += 2; | |||||
| y0 += da_vec * x0; | |||||
| ST_DP(y0, y); y += 2; | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| y[0] += da * x[0]; | |||||
| } | |||||
| } | |||||
| } | |||||
| else if (1 == inc_y) | |||||
| { | |||||
| FLOAT *y_pref; | |||||
| BLASLONG pref_offset; | |||||
| v2f64 x8, x9, x10, x11, x12, x13, x14; | |||||
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| y_pref = y + pref_offset + 32; | |||||
| da_vec = COPY_DOUBLE_TO_VECTOR(da); | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| PREF_OFFSET(y_pref, 0); | |||||
| PREF_OFFSET(y_pref, 32); | |||||
| PREF_OFFSET(y_pref, 64); | |||||
| PREF_OFFSET(y_pref, 96); | |||||
| y_pref += 16; | |||||
| LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x14); | |||||
| LD_DP7_INC(x, inc_x, x8, x9, x10, x11, x12, x13, x7); | |||||
| PCKEV_D2_SD(x1, x0, x3, x2, x0, x1); | |||||
| PCKEV_D2_SD(x5, x4, x14, x6, x2, x3); | |||||
| PCKEV_D2_SD(x9, x8, x11, x10, x4, x5); | |||||
| x6 = (v2f64) __msa_pckev_d((v2i64) x13, (v2i64) x12); | |||||
| x7 = (v2f64) __msa_insert_d((v2i64) x7, 1, *((BLASLONG *) x)); | |||||
| x += inc_x; | |||||
| LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); | |||||
| FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); | |||||
| FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7); | |||||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| LD_DP7_INC(x, inc_x, x0, x1, x2, x6, x4, x5, x3); | |||||
| PCKEV_D2_SD(x1, x0, x6, x2, x0, x1); | |||||
| x2 = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4); | |||||
| x3 = (v2f64) __msa_insert_d((v2i64) x3, 1, *((BLASLONG *) x)); | |||||
| x += inc_x; | |||||
| LD_DP4_INC(py, 2, y0, y1, y2, y3); | |||||
| FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); | |||||
| ST_DP4_INC(y0, y1, y2, y3, y, 2); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP3_INC(x, inc_x, x0, x2, x1); | |||||
| x0 = (v2f64) __msa_pckev_d((v2i64) x2, (v2i64) x0); | |||||
| x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((BLASLONG *) x)); | |||||
| x += inc_x; | |||||
| LD_DP2_INC(py, 2, y0, y1); | |||||
| FMADD2(x0, x1, da_vec, y0, y1); | |||||
| ST_DP2_INC(y0, y1, y, 2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| x0 = (v2f64) __msa_insert_d((v2i64) zero_v, 0, *((BLASLONG *) x)); | |||||
| x += inc_x; | |||||
| x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((BLASLONG *) x)); | |||||
| x += inc_x; | |||||
| y0 = LD_DP(py); py += 2; | |||||
| y0 += da_vec * x0; | |||||
| ST_DP(y0, y); y += 2; | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| y[0] += da * x[0]; | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| FLOAT x0, x1, x2, x3, y0, y1, y2, y3; | |||||
| for (i = (n >> 2); i--;) | |||||
| { | |||||
| LD_GP4_INC(x, inc_x, x0, x1, x2, x3); | |||||
| LD_GP4_INC(py, inc_y, y0, y1, y2, y3); | |||||
| FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3); | |||||
| ST_GP4_INC(y0, y1, y2, y3, y, inc_y); | |||||
| } | |||||
| if (n & 3) | |||||
| { | |||||
| if (n & 2) | |||||
| { | |||||
| LD_GP2_INC(x, inc_x, x0, x1); | |||||
| LD_GP2_INC(py, inc_y, y0, y1); | |||||
| FMADD2(x0, x1, da, y0, y1); | |||||
| ST_GP2_INC(y0, y1, y, inc_y); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *y += da * *x; | |||||
| } | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,180 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i; | |||||
| v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; | |||||
| FLOAT f0, f1, f2, f3, f4, f5, f6, f7; | |||||
| if (n < 0) return (0); | |||||
| if ((1 == inc_x) && (1 == inc_y)) | |||||
| { | |||||
| if (n > 31) | |||||
| { | |||||
| FLOAT *x_pref; | |||||
| BLASLONG pref_offset; | |||||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| x_pref = x + pref_offset + 64 + 16; | |||||
| LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| for (i = (n >> 5) - 1; i--;) | |||||
| { | |||||
| PREF_OFFSET(x_pref, 0); | |||||
| PREF_OFFSET(x_pref, 32); | |||||
| PREF_OFFSET(x_pref, 64); | |||||
| PREF_OFFSET(x_pref, 96); | |||||
| PREF_OFFSET(x_pref, 128); | |||||
| PREF_OFFSET(x_pref, 160); | |||||
| PREF_OFFSET(x_pref, 192); | |||||
| PREF_OFFSET(x_pref, 224); | |||||
| x_pref += 32; | |||||
| x8 = LD_DP(x); x += 2; | |||||
| ST_DP(x0, y); y += 2; | |||||
| x9 = LD_DP(x); x += 2; | |||||
| ST_DP(x1, y); y += 2; | |||||
| x10 = LD_DP(x); x += 2; | |||||
| ST_DP(x2, y); y += 2; | |||||
| x11 = LD_DP(x); x += 2; | |||||
| ST_DP(x3, y); y += 2; | |||||
| x12 = LD_DP(x); x += 2; | |||||
| ST_DP(x4, y); y += 2; | |||||
| x13 = LD_DP(x); x += 2; | |||||
| ST_DP(x5, y); y += 2; | |||||
| x14 = LD_DP(x); x += 2; | |||||
| ST_DP(x6, y); y += 2; | |||||
| x15 = LD_DP(x); x += 2; | |||||
| ST_DP(x7, y); y += 2; | |||||
| x0 = LD_DP(x); x += 2; | |||||
| ST_DP(x8, y); y += 2; | |||||
| x1 = LD_DP(x); x += 2; | |||||
| ST_DP(x9, y); y += 2; | |||||
| x2 = LD_DP(x); x += 2; | |||||
| ST_DP(x10, y); y += 2; | |||||
| x3 = LD_DP(x); x += 2; | |||||
| ST_DP(x11, y); y += 2; | |||||
| x4 = LD_DP(x); x += 2; | |||||
| ST_DP(x12, y); y += 2; | |||||
| x5 = LD_DP(x); x += 2; | |||||
| ST_DP(x13, y); y += 2; | |||||
| x6 = LD_DP(x); x += 2; | |||||
| ST_DP(x14, y); y += 2; | |||||
| x7 = LD_DP(x); x += 2; | |||||
| ST_DP(x15, y); y += 2; | |||||
| } | |||||
| x8 = LD_DP(x); x += 2; | |||||
| x9 = LD_DP(x); x += 2; | |||||
| ST_DP(x0, y); y += 2; | |||||
| x10 = LD_DP(x); x += 2; | |||||
| ST_DP(x1, y); y += 2; | |||||
| x11 = LD_DP(x); x += 2; | |||||
| ST_DP(x2, y); y += 2; | |||||
| x12 = LD_DP(x); x += 2; | |||||
| ST_DP(x3, y); y += 2; | |||||
| x13 = LD_DP(x); x += 2; | |||||
| ST_DP(x4, y); y += 2; | |||||
| x14 = LD_DP(x); x += 2; | |||||
| ST_DP(x5, y); y += 2; | |||||
| x15 = LD_DP(x); x += 2; | |||||
| ST_DP(x6, y); y += 2; | |||||
| ST_DP(x7, y); y += 2; | |||||
| ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2); | |||||
| } | |||||
| if (n & 31) | |||||
| { | |||||
| if (n & 16) | |||||
| { | |||||
| LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2); | |||||
| } | |||||
| if (n & 8) | |||||
| { | |||||
| LD_DP4_INC(x, 2, x0, x1, x2, x3); | |||||
| ST_DP4_INC(x0, x1, x2, x3, y, 2); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_GP4_INC(x, 1, f0, f1, f2, f3); | |||||
| ST_GP4_INC(f0, f1, f2, f3, y, 1); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_GP2_INC(x, 1, f0, f1); | |||||
| ST_GP2_INC(f0, f1, y, 1); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *y = *x; | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for (i = (n >> 3); i--;) | |||||
| { | |||||
| LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); | |||||
| ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_GP4_INC(x, inc_x, f0, f1, f2, f3); | |||||
| ST_GP4_INC(f0, f1, f2, f3, y, inc_y); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_GP2_INC(x, inc_x, f0, f1); | |||||
| ST_GP2_INC(f0, f1, y, inc_y); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *y = *x; | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,368 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||||
| BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *px; | |||||
| FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15; | |||||
| v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; | |||||
| v2f64 da_vec; | |||||
| px = x; | |||||
| if (1 == inc_x) | |||||
| { | |||||
| if (0.0 == da) | |||||
| { | |||||
| v2f64 zero_v = __msa_cast_to_vector_double(0); | |||||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||||
| for (i = (n >> 5); i--;) | |||||
| { | |||||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||||
| zero_v, zero_v, x, 2); | |||||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||||
| zero_v, zero_v, x, 2); | |||||
| } | |||||
| if (n & 31) | |||||
| { | |||||
| if (n & 16) | |||||
| { | |||||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||||
| zero_v, zero_v, x, 2); | |||||
| } | |||||
| if (n & 8) | |||||
| { | |||||
| ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| ST_DP2_INC(zero_v, zero_v, x, 2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| *x = 0; x += 1; | |||||
| *x = 0; x += 1; | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *x = 0; | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| da_vec = COPY_DOUBLE_TO_VECTOR(da); | |||||
| if (n > 31) | |||||
| { | |||||
| FLOAT *x_pref; | |||||
| BLASLONG pref_offset; | |||||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| x_pref = x + pref_offset + 32 + 16; | |||||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| for (i = 0; i < (n >> 5) - 1; i++) | |||||
| { | |||||
| PREF_OFFSET(x_pref, 0); | |||||
| PREF_OFFSET(x_pref, 32); | |||||
| PREF_OFFSET(x_pref, 64); | |||||
| PREF_OFFSET(x_pref, 96); | |||||
| PREF_OFFSET(x_pref, 128); | |||||
| PREF_OFFSET(x_pref, 160); | |||||
| PREF_OFFSET(x_pref, 192); | |||||
| PREF_OFFSET(x_pref, 224); | |||||
| x_pref += 32; | |||||
| x8 = LD_DP(px); px += 2; | |||||
| x0 *= da_vec; | |||||
| x9 = LD_DP(px); px += 2; | |||||
| x1 *= da_vec; | |||||
| x10 = LD_DP(px); px += 2; | |||||
| x2 *= da_vec; | |||||
| x11 = LD_DP(px); px += 2; | |||||
| x3 *= da_vec; | |||||
| x12 = LD_DP(px); px += 2; | |||||
| x4 *= da_vec; | |||||
| x13 = LD_DP(px); px += 2; | |||||
| x5 *= da_vec; | |||||
| x14 = LD_DP(px); px += 2; | |||||
| x6 *= da_vec; | |||||
| x15 = LD_DP(px); px += 2; | |||||
| x7 *= da_vec; | |||||
| x8 *= da_vec; | |||||
| ST_DP(x0, x); x += 2; | |||||
| x9 *= da_vec; | |||||
| ST_DP(x1, x); x += 2; | |||||
| x10 *= da_vec; | |||||
| ST_DP(x2, x); x += 2; | |||||
| x11 *= da_vec; | |||||
| ST_DP(x3, x); x += 2; | |||||
| x12 *= da_vec; | |||||
| ST_DP(x4, x); x += 2; | |||||
| x13 *= da_vec; | |||||
| ST_DP(x5, x); x += 2; | |||||
| x14 *= da_vec; | |||||
| ST_DP(x6, x); x += 2; | |||||
| x15 *= da_vec; | |||||
| ST_DP(x7, x); x += 2; | |||||
| ST_DP(x8, x); x += 2; | |||||
| x0 = LD_DP(px); px += 2; | |||||
| ST_DP(x9, x); x += 2; | |||||
| x1 = LD_DP(px); px += 2; | |||||
| ST_DP(x10, x); x += 2; | |||||
| x2 = LD_DP(px); px += 2; | |||||
| ST_DP(x11, x); x += 2; | |||||
| x3 = LD_DP(px); px += 2; | |||||
| ST_DP(x12, x); x += 2; | |||||
| x4 = LD_DP(px); px += 2; | |||||
| ST_DP(x13, x); x += 2; | |||||
| x5 = LD_DP(px); px += 2; | |||||
| ST_DP(x14, x); x += 2; | |||||
| x6 = LD_DP(px); px += 2; | |||||
| ST_DP(x15, x); x += 2; | |||||
| x7 = LD_DP(px); px += 2; | |||||
| } | |||||
| x8 = LD_DP(px); px += 2; | |||||
| x0 *= da_vec; | |||||
| x9 = LD_DP(px); px += 2; | |||||
| x1 *= da_vec; | |||||
| x10 = LD_DP(px); px += 2; | |||||
| x2 *= da_vec; | |||||
| x11 = LD_DP(px); px += 2; | |||||
| x3 *= da_vec; | |||||
| x12 = LD_DP(px); px += 2; | |||||
| x4 *= da_vec; | |||||
| x13 = LD_DP(px); px += 2; | |||||
| x5 *= da_vec; | |||||
| x14 = LD_DP(px); px += 2; | |||||
| x6 *= da_vec; | |||||
| x15 = LD_DP(px); px += 2; | |||||
| x7 *= da_vec; | |||||
| x8 *= da_vec; | |||||
| ST_DP(x0, x); x += 2; | |||||
| x9 *= da_vec; | |||||
| ST_DP(x1, x); x += 2; | |||||
| x10 *= da_vec; | |||||
| ST_DP(x2, x); x += 2; | |||||
| x11 *= da_vec; | |||||
| ST_DP(x3, x); x += 2; | |||||
| x12 *= da_vec; | |||||
| ST_DP(x4, x); x += 2; | |||||
| x13 *= da_vec; | |||||
| ST_DP(x5, x); x += 2; | |||||
| x15 *= da_vec; | |||||
| ST_DP(x6, x); x += 2; | |||||
| x14 *= da_vec; | |||||
| ST_DP(x7, x); x += 2; | |||||
| ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 2); | |||||
| } | |||||
| if (n & 31) | |||||
| { | |||||
| if (n & 16) | |||||
| { | |||||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3); | |||||
| MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7); | |||||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2); | |||||
| } | |||||
| if (n & 8) | |||||
| { | |||||
| LD_DP4_INC(px, 2, x0, x1, x2, x3); | |||||
| MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3); | |||||
| ST_DP4_INC(x0, x1, x2, x3, x, 2); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP2_INC(px, 2, x0, x1); | |||||
| MUL2(x0, da_vec, x1, da_vec, x0, x1); | |||||
| ST_DP2_INC(x0, x1, x, 2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_GP2_INC(px, 1, f0, f1); | |||||
| MUL2(f0, da, f1, da, f0, f1); | |||||
| ST_GP2_INC(f0, f1, x, 1); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *x *= da; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| if (da == 0.0) | |||||
| { | |||||
| for (i = n; i--;) | |||||
| { | |||||
| *x = 0.0; | |||||
| x += inc_x; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| if (n > 15) | |||||
| { | |||||
| LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); | |||||
| for (i = 0; i < (n >> 4) - 1; i++) | |||||
| { | |||||
| LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15); | |||||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||||
| f4 *= da; | |||||
| f5 *= da; | |||||
| *x = f0; x += inc_x; | |||||
| f6 *= da; | |||||
| *x = f1; x += inc_x; | |||||
| f7 *= da; | |||||
| *x = f2; x += inc_x; | |||||
| f8 *= da; | |||||
| *x = f3; x += inc_x; | |||||
| f9 *= da; | |||||
| *x = f4; x += inc_x; | |||||
| f10 *= da; | |||||
| *x = f5; x += inc_x; | |||||
| f11 *= da; | |||||
| *x = f6; x += inc_x; | |||||
| f12 *= da; | |||||
| *x = f7; x += inc_x; | |||||
| f13 *= da; | |||||
| *x = f8; x += inc_x; | |||||
| f14 *= da; | |||||
| *x = f9; x += inc_x; | |||||
| f15 *= da; | |||||
| *x = f10; x += inc_x; | |||||
| *x = f11; x += inc_x; | |||||
| f0 = *px; px += inc_x; | |||||
| *x = f12; x += inc_x; | |||||
| f1 = *px; px += inc_x; | |||||
| *x = f13; x += inc_x; | |||||
| f2 = *px; px += inc_x; | |||||
| *x = f14; x += inc_x; | |||||
| f3 = *px; px += inc_x; | |||||
| *x = f15; x += inc_x; | |||||
| f4 = *px; px += inc_x; | |||||
| f5 = *px; px += inc_x; | |||||
| f6 = *px; px += inc_x; | |||||
| f7 = *px; px += inc_x; | |||||
| } | |||||
| LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15); | |||||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||||
| f4 *= da; | |||||
| f5 *= da; | |||||
| *x = f0; x += inc_x; | |||||
| f6 *= da; | |||||
| *x = f1; x += inc_x; | |||||
| f7 *= da; | |||||
| *x = f2; x += inc_x; | |||||
| f8 *= da; | |||||
| *x = f3; x += inc_x; | |||||
| f9 *= da; | |||||
| *x = f4; x += inc_x; | |||||
| f10 *= da; | |||||
| *x = f5; x += inc_x; | |||||
| f11 *= da; | |||||
| *x = f6; x += inc_x; | |||||
| f12 *= da; | |||||
| *x = f7; x += inc_x; | |||||
| f13 *= da; | |||||
| *x = f8; x += inc_x; | |||||
| f14 *= da; | |||||
| *x = f9; x += inc_x; | |||||
| f15 *= da; | |||||
| *x = f10; x += inc_x; | |||||
| *x = f11; x += inc_x; | |||||
| *x = f12; x += inc_x; | |||||
| *x = f13; x += inc_x; | |||||
| *x = f14; x += inc_x; | |||||
| *x = f15; x += inc_x; | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); | |||||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||||
| MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7); | |||||
| ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_GP4_INC(px, inc_x, f0, f1, f2, f3); | |||||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||||
| ST_GP4_INC(f0, f1, f2, f3, x, inc_x); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_GP2_INC(px, inc_x, f0, f1); | |||||
| MUL2(f0, da, f1, da, f0, f1); | |||||
| ST_GP2_INC(f0, f1, x, inc_x); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *x *= da; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,253 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, | |||||
| FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y, | |||||
| FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i = 0, pref_offsetx, pref_offsety; | |||||
| FLOAT *px, *py; | |||||
| FLOAT x0, x1, x2, x3, x4, x5, x6, x7; | |||||
| FLOAT y0, y1, y2, y3, y4, y5, y6, y7; | |||||
| v2f64 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7; | |||||
| v2f64 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7; | |||||
| if (n < 0) return (0); | |||||
| pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offsetx > 0) | |||||
| { | |||||
| pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; | |||||
| pref_offsetx = pref_offsetx / sizeof(FLOAT); | |||||
| } | |||||
| pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offsety > 0) | |||||
| { | |||||
| pref_offsety = L1_DATA_LINESIZE - pref_offsety; | |||||
| pref_offsety = pref_offsety / sizeof(FLOAT); | |||||
| } | |||||
| px = srcx; | |||||
| py = srcy; | |||||
| if ((1 == inc_x) && (1 == inc_y)) | |||||
| { | |||||
| if (n >> 4) | |||||
| { | |||||
| LD_DP8_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7); | |||||
| for (i = (n >> 4) - 1; i--;) | |||||
| { | |||||
| PREFETCH(px + pref_offsetx + 16); | |||||
| PREFETCH(px + pref_offsetx + 20); | |||||
| PREFETCH(px + pref_offsetx + 24); | |||||
| PREFETCH(px + pref_offsetx + 28); | |||||
| PREFETCH(py + pref_offsety + 16); | |||||
| PREFETCH(py + pref_offsety + 20); | |||||
| PREFETCH(py + pref_offsety + 24); | |||||
| PREFETCH(py + pref_offsety + 28); | |||||
| yv0 = LD_DP(py); py += 2; | |||||
| ST_DP(xv0, srcy); srcy += 2; | |||||
| yv1 = LD_DP(py); py += 2; | |||||
| ST_DP(xv1, srcy); srcy += 2; | |||||
| yv2 = LD_DP(py); py += 2; | |||||
| ST_DP(xv2, srcy); srcy += 2; | |||||
| yv3 = LD_DP(py); py += 2; | |||||
| ST_DP(xv3, srcy); srcy += 2; | |||||
| yv4 = LD_DP(py); py += 2; | |||||
| ST_DP(xv4, srcy); srcy += 2; | |||||
| yv5 = LD_DP(py); py += 2; | |||||
| ST_DP(xv5, srcy); srcy += 2; | |||||
| yv6 = LD_DP(py); py += 2; | |||||
| ST_DP(xv6, srcy); srcy += 2; | |||||
| yv7 = LD_DP(py); py += 2; | |||||
| ST_DP(xv7, srcy); srcy += 2; | |||||
| xv0 = LD_DP(px); px += 2; | |||||
| ST_DP(yv0, srcx); srcx += 2; | |||||
| xv1 = LD_DP(px); px += 2; | |||||
| ST_DP(yv1, srcx); srcx += 2; | |||||
| xv2 = LD_DP(px); px += 2; | |||||
| ST_DP(yv2, srcx); srcx += 2; | |||||
| xv3 = LD_DP(px); px += 2; | |||||
| ST_DP(yv3, srcx); srcx += 2; | |||||
| xv4 = LD_DP(px); px += 2; | |||||
| ST_DP(yv4, srcx); srcx += 2; | |||||
| xv5 = LD_DP(px); px += 2; | |||||
| ST_DP(yv5, srcx); srcx += 2; | |||||
| xv6 = LD_DP(px); px += 2; | |||||
| ST_DP(yv6, srcx); srcx += 2; | |||||
| xv7 = LD_DP(px); px += 2; | |||||
| ST_DP(yv7, srcx); srcx += 2; | |||||
| } | |||||
| LD_DP8_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7); | |||||
| ST_DP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 2); | |||||
| ST_DP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 2); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if ((n & 8) && (n & 4) && (n & 2)) | |||||
| { | |||||
| LD_DP7_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6); | |||||
| LD_DP7_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6); | |||||
| ST_DP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 2); | |||||
| ST_DP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 2); | |||||
| } | |||||
| else if ((n & 8) && (n & 4)) | |||||
| { | |||||
| LD_DP6_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5); | |||||
| LD_DP6_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5); | |||||
| ST_DP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 2); | |||||
| ST_DP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 2); | |||||
| } | |||||
| else if ((n & 8) && (n & 2)) | |||||
| { | |||||
| LD_DP5_INC(px, 2, xv0, xv1, xv2, xv3, xv4); | |||||
| LD_DP5_INC(py, 2, yv0, yv1, yv2, yv3, yv4); | |||||
| ST_DP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 2); | |||||
| ST_DP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 2); | |||||
| } | |||||
| else if ((n & 4) && (n & 2)) | |||||
| { | |||||
| LD_DP3_INC(px, 2, xv0, xv1, xv2); | |||||
| LD_DP3_INC(py, 2, yv0, yv1, yv2); | |||||
| ST_DP3_INC(xv0, xv1, xv2, srcy, 2); | |||||
| ST_DP3_INC(yv0, yv1, yv2, srcx, 2); | |||||
| } | |||||
| else if (n & 8) | |||||
| { | |||||
| LD_DP4_INC(px, 2, xv0, xv1, xv2, xv3); | |||||
| LD_DP4_INC(py, 2, yv0, yv1, yv2, yv3); | |||||
| ST_DP4_INC(xv0, xv1, xv2, xv3, srcy, 2); | |||||
| ST_DP4_INC(yv0, yv1, yv2, yv3, srcx, 2); | |||||
| } | |||||
| else if (n & 4) | |||||
| { | |||||
| LD_DP2_INC(px, 2, xv0, xv1); | |||||
| LD_DP2_INC(py, 2, yv0, yv1); | |||||
| ST_DP2_INC(xv0, xv1, srcy, 2); | |||||
| ST_DP2_INC(yv0, yv1, srcx, 2); | |||||
| } | |||||
| else if (n & 2) | |||||
| { | |||||
| xv0 = LD_DP(px); | |||||
| yv0 = LD_DP(py); | |||||
| px += 2; | |||||
| py += 2; | |||||
| ST_DP(xv0, srcy); | |||||
| ST_DP(yv0, srcx); | |||||
| srcx += 2; | |||||
| srcy += 2; | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| x0 = px[0]; | |||||
| y0 = py[0]; | |||||
| srcx[0] = y0; | |||||
| srcy[0] = x0; | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for (i = (n >> 3); i--;) | |||||
| { | |||||
| LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7); | |||||
| ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y); | |||||
| ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x); | |||||
| } | |||||
| if (n & 7) | |||||
| { | |||||
| if ((n & 4) && (n & 2) && (n & 1)) | |||||
| { | |||||
| LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6); | |||||
| LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6); | |||||
| ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y); | |||||
| ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x); | |||||
| } | |||||
| else if ((n & 4) && (n & 2)) | |||||
| { | |||||
| LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5); | |||||
| LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5); | |||||
| ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y); | |||||
| ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x); | |||||
| } | |||||
| else if ((n & 4) && (n & 1)) | |||||
| { | |||||
| LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4); | |||||
| LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4); | |||||
| ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y); | |||||
| ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x); | |||||
| } | |||||
| else if ((n & 2) && (n & 1)) | |||||
| { | |||||
| LD_GP3_INC(px, inc_x, x0, x1, x2); | |||||
| LD_GP3_INC(py, inc_y, y0, y1, y2); | |||||
| ST_GP3_INC(x0, x1, x2, srcy, inc_y); | |||||
| ST_GP3_INC(y0, y1, y2, srcx, inc_x); | |||||
| } | |||||
| else if (n & 4) | |||||
| { | |||||
| LD_GP4_INC(px, inc_x, x0, x1, x2, x3); | |||||
| LD_GP4_INC(py, inc_y, y0, y1, y2, y3); | |||||
| ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y); | |||||
| ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x); | |||||
| } | |||||
| else if (n & 2) | |||||
| { | |||||
| LD_GP2_INC(px, inc_x, x0, x1); | |||||
| LD_GP2_INC(py, inc_y, y0, y1); | |||||
| ST_GP2_INC(x0, x1, srcy, inc_y); | |||||
| ST_GP2_INC(y0, y1, srcx, inc_x); | |||||
| } | |||||
| else if (n & 1) | |||||
| { | |||||
| x0 = *srcx; | |||||
| y0 = *srcy; | |||||
| *srcx = y0; | |||||
| *srcy = x0; | |||||
| } | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -722,6 +722,31 @@ inline static void prefetch_load_lf(unsigned char *src) | |||||
| MUL2(in4, in5, in6, in7, out2, out3); \ | MUL2(in4, in5, in6, in7, out2, out3); \ | ||||
| } | } | ||||
/* Description : Multiply-accumulate of several values by one common factor
   Arguments   : Inputs  - in0, in1 (, in2, in3), vec
                 In/Out  - inout0, inout1 (, inout2, inout3)
   Details     : Each 'inN' is multiplied by 'vec' and the product is added
                 to the matching 'inoutN', i.e. inoutN += inN * vec.
                 The operands may be scalars or vectors, as long as the
                 '*' and '+=' operators are defined for them.
   Notes       : Every argument is parenthesized in the expansion, so
                 expression arguments (e.g. 'a + b') keep their meaning.
                 'vec' is evaluated once per accumulator, so it should not
                 have side effects.  Each 'inoutN' must be an lvalue.
*/
#define FMADD2(in0, in1, vec, inout0, inout1)  \
{                                              \
    (inout0) += (in0) * (vec);                 \
    (inout1) += (in1) * (vec);                 \
}

#define FMADD3(in0, in1, in2, vec,     \
               inout0, inout1, inout2) \
{                                      \
    (inout0) += (in0) * (vec);         \
    (inout1) += (in1) * (vec);         \
    (inout2) += (in2) * (vec);         \
}

/* Four-way variant built from two FMADD2 expansions. */
#define FMADD4(in0, in1, in2, in3, vec,         \
               inout0, inout1, inout2, inout3)  \
{                                               \
    FMADD2(in0, in1, vec, inout0, inout1);      \
    FMADD2(in2, in3, vec, inout2, inout3);      \
}
| /* Description : Addition of 2 pairs of variables | /* Description : Addition of 2 pairs of variables | ||||
| Arguments : Inputs - in0, in1, in2, in3 | Arguments : Inputs - in0, in1, in2, in3 | ||||
| Outputs - out0, out1 | Outputs - out0, out1 | ||||
| @@ -0,0 +1,265 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
/* Compound-assignment operators selected by whether CONJ is defined:
   without CONJ, OP0/OP1/OP2 are +=, -=, +=; with CONJ they are -=, +=, -=.
   NOTE(review): the CNAME kernel in this file never references OP0, OP1 or
   OP2 - they look like leftovers; confirm before removing. */
| #if !defined(CONJ) | |||||
| #define OP0 += | |||||
| #define OP1 -= | |||||
| #define OP2 += | |||||
| #else | |||||
| #define OP0 -= | |||||
| #define OP1 += | |||||
| #define OP2 -= | |||||
| #endif | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||||
| BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *py; | |||||
| v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; | |||||
| v4f32 da_vec, zero_v = {0}; | |||||
| if ((n < 0) || (da == 0.0)) return(0); | |||||
| py = y; | |||||
| if ((1 == inc_x) && (1 == inc_y)) | |||||
| { | |||||
| FLOAT *x_pref, *y_pref; | |||||
| BLASLONG pref_offset; | |||||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| x_pref = x + pref_offset + 64; | |||||
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| y_pref = y + pref_offset + 64; | |||||
| da_vec = COPY_FLOAT_TO_VECTOR(da); | |||||
| for (i = (n >> 5); i--;) | |||||
| { | |||||
| PREF_OFFSET(x_pref, 0); | |||||
| PREF_OFFSET(x_pref, 32); | |||||
| PREF_OFFSET(x_pref, 64); | |||||
| PREF_OFFSET(x_pref, 96); | |||||
| PREF_OFFSET(y_pref, 0); | |||||
| PREF_OFFSET(y_pref, 32); | |||||
| PREF_OFFSET(y_pref, 64); | |||||
| PREF_OFFSET(y_pref, 96); | |||||
| x_pref += 32; | |||||
| y_pref += 32; | |||||
| LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7); | |||||
| FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); | |||||
| FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7); | |||||
| ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4); | |||||
| } | |||||
| if (n & 31) | |||||
| { | |||||
| if (n & 16) | |||||
| { | |||||
| LD_SP4_INC(x, 4, x0, x1, x2, x3); | |||||
| LD_SP4_INC(py, 4, y0, y1, y2, y3); | |||||
| FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); | |||||
| ST_SP4_INC(y0, y1, y2, y3, y, 4); | |||||
| } | |||||
| if (n & 8) | |||||
| { | |||||
| LD_SP2_INC(x, 4, x0, x1); | |||||
| LD_SP2_INC(py, 4, y0, y1); | |||||
| FMADD2(x0, x1, da_vec, y0, y1); | |||||
| ST_SP2_INC(y0, y1, y, 4); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| x0 = LD_SP(x); x += 4; | |||||
| y0 = LD_SP(py); py += 4; | |||||
| y0 += da_vec * x0; | |||||
| ST_SP(y0, y); y += 4; | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| FMADD2(x[0], x[1], da, y[0], y[1]); | |||||
| x += 2; | |||||
| y += 2; | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| y[0] += da * x[0]; | |||||
| } | |||||
| } | |||||
| } | |||||
| else if (1 == inc_y) | |||||
| { | |||||
| da_vec = COPY_FLOAT_TO_VECTOR(da); | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||||
| x += inc_x; | |||||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x)); | |||||
| x += inc_x; | |||||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x)); | |||||
| x += inc_x; | |||||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x)); | |||||
| x += inc_x; | |||||
| x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||||
| x += inc_x; | |||||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x)); | |||||
| x += inc_x; | |||||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x)); | |||||
| x += inc_x; | |||||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x)); | |||||
| x += inc_x; | |||||
| x2 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||||
| x += inc_x; | |||||
| x2 = (v4f32) __msa_insert_w((v4i32) x2, 1, *((int *) x)); | |||||
| x += inc_x; | |||||
| x2 = (v4f32) __msa_insert_w((v4i32) x2, 2, *((int *) x)); | |||||
| x += inc_x; | |||||
| x2 = (v4f32) __msa_insert_w((v4i32) x2, 3, *((int *) x)); | |||||
| x += inc_x; | |||||
| x3 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||||
| x += inc_x; | |||||
| x3 = (v4f32) __msa_insert_w((v4i32) x3, 1, *((int *) x)); | |||||
| x += inc_x; | |||||
| x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x)); | |||||
| x += inc_x; | |||||
| x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) x)); | |||||
| x += inc_x; | |||||
| LD_SP4_INC(py, 4, y0, y1, y2, y3); | |||||
| FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); | |||||
| ST_SP4_INC(y0, y1, y2, y3, y, 4); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||||
| x += inc_x; | |||||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x)); | |||||
| x += inc_x; | |||||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x)); | |||||
| x += inc_x; | |||||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x)); | |||||
| x += inc_x; | |||||
| x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||||
| x += inc_x; | |||||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x)); | |||||
| x += inc_x; | |||||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x)); | |||||
| x += inc_x; | |||||
| x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x)); | |||||
| x += inc_x; | |||||
| LD_SP2_INC(py, 4, y0, y1); | |||||
| FMADD2(x0, x1, da_vec, y0, y1); | |||||
| ST_SP2_INC(y0, y1, y, 4); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x)); | |||||
| x += inc_x; | |||||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x)); | |||||
| x += inc_x; | |||||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x)); | |||||
| x += inc_x; | |||||
| x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x)); | |||||
| x += inc_x; | |||||
| y0 = LD_SP(py); py += 4; | |||||
| y0 += da_vec * x0; | |||||
| ST_SP(y0, y); y += 4; | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| FMADD2(x[0], x[inc_x], da, y[0], y[1]); | |||||
| x += 2 * inc_x; | |||||
| y += 2; | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| y[0] += da * x[0]; | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| FLOAT x0, x1, x2, x3, y0, y1, y2, y3; | |||||
| for (i = (n >> 2); i--;) | |||||
| { | |||||
| LD_GP4_INC(x, inc_x, x0, x1, x2, x3); | |||||
| LD_GP4_INC(py, inc_y, y0, y1, y2, y3); | |||||
| FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3); | |||||
| ST_GP4_INC(y0, y1, y2, y3, y, inc_y); | |||||
| } | |||||
| if (n & 3) | |||||
| { | |||||
| if (n & 2) | |||||
| { | |||||
| LD_GP2_INC(x, inc_x, x0, x1); | |||||
| LD_GP2_INC(py, inc_y, y0, y1); | |||||
| FMADD2(x0, x1, da, y0, y1); | |||||
| ST_GP2_INC(y0, y1, y, inc_y); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *y += da * *x; | |||||
| } | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,186 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i; | |||||
| v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; | |||||
| FLOAT f0, f1, f2, f3, f4, f5, f6, f7; | |||||
| if (n < 0) return (0); | |||||
| if ((1 == inc_x) && (1 == inc_y)) | |||||
| { | |||||
| if (n > 63) | |||||
| { | |||||
| FLOAT *x_pref; | |||||
| BLASLONG pref_offset; | |||||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| x_pref = x + pref_offset + 128 + 32; | |||||
| LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| for (i = (n >> 6) - 1; i--;) | |||||
| { | |||||
| PREF_OFFSET(x_pref, 0); | |||||
| PREF_OFFSET(x_pref, 32); | |||||
| PREF_OFFSET(x_pref, 64); | |||||
| PREF_OFFSET(x_pref, 96); | |||||
| PREF_OFFSET(x_pref, 128); | |||||
| PREF_OFFSET(x_pref, 160); | |||||
| PREF_OFFSET(x_pref, 192); | |||||
| PREF_OFFSET(x_pref, 224); | |||||
| x_pref += 64; | |||||
| x8 = LD_SP(x); x += 4; | |||||
| ST_SP(x0, y); y += 4; | |||||
| x9 = LD_SP(x); x += 4; | |||||
| ST_SP(x1, y); y += 4; | |||||
| x10 = LD_SP(x); x += 4; | |||||
| ST_SP(x2, y); y += 4; | |||||
| x11 = LD_SP(x); x += 4; | |||||
| ST_SP(x3, y); y += 4; | |||||
| x12 = LD_SP(x); x += 4; | |||||
| ST_SP(x4, y); y += 4; | |||||
| x13 = LD_SP(x); x += 4; | |||||
| ST_SP(x5, y); y += 4; | |||||
| x14 = LD_SP(x); x += 4; | |||||
| ST_SP(x6, y); y += 4; | |||||
| x15 = LD_SP(x); x += 4; | |||||
| ST_SP(x7, y); y += 4; | |||||
| x0 = LD_SP(x); x += 4; | |||||
| ST_SP(x8, y); y += 4; | |||||
| x1 = LD_SP(x); x += 4; | |||||
| ST_SP(x9, y); y += 4; | |||||
| x2 = LD_SP(x); x += 4; | |||||
| ST_SP(x10, y); y += 4; | |||||
| x3 = LD_SP(x); x += 4; | |||||
| ST_SP(x11, y); y += 4; | |||||
| x4 = LD_SP(x); x += 4; | |||||
| ST_SP(x12, y); y += 4; | |||||
| x5 = LD_SP(x); x += 4; | |||||
| ST_SP(x13, y); y += 4; | |||||
| x6 = LD_SP(x); x += 4; | |||||
| ST_SP(x14, y); y += 4; | |||||
| x7 = LD_SP(x); x += 4; | |||||
| ST_SP(x15, y); y += 4; | |||||
| } | |||||
| x8 = LD_SP(x); x += 4; | |||||
| x9 = LD_SP(x); x += 4; | |||||
| ST_SP(x0, y); y += 4; | |||||
| x10 = LD_SP(x); x += 4; | |||||
| ST_SP(x1, y); y += 4; | |||||
| x11 = LD_SP(x); x += 4; | |||||
| ST_SP(x2, y); y += 4; | |||||
| x12 = LD_SP(x); x += 4; | |||||
| ST_SP(x3, y); y += 4; | |||||
| x13 = LD_SP(x); x += 4; | |||||
| ST_SP(x4, y); y += 4; | |||||
| x14 = LD_SP(x); x += 4; | |||||
| ST_SP(x5, y); y += 4; | |||||
| x15 = LD_SP(x); x += 4; | |||||
| ST_SP(x6, y); y += 4; | |||||
| ST_SP(x7, y); y += 4; | |||||
| ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4); | |||||
| } | |||||
| if (n & 63) | |||||
| { | |||||
| if (n & 32) | |||||
| { | |||||
| LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4); | |||||
| } | |||||
| if (n & 16) | |||||
| { | |||||
| LD_SP4_INC(x, 4, x0, x1, x2, x3); | |||||
| ST_SP4_INC(x0, x1, x2, x3, y, 4); | |||||
| } | |||||
| if (n & 8) | |||||
| { | |||||
| LD_SP2_INC(x, 4, x0, x1); | |||||
| ST_SP2_INC(x0, x1, y, 4); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_GP4_INC(x, 1, f0, f1, f2, f3); | |||||
| ST_GP4_INC(f0, f1, f2, f3, y, 1); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_GP2_INC(x, 1, f0, f1); | |||||
| ST_GP2_INC(f0, f1, y, 1); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *y = *x; | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for (i = (n >> 3); i--;) | |||||
| { | |||||
| LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); | |||||
| ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_GP4_INC(x, inc_x, f0, f1, f2, f3); | |||||
| ST_GP4_INC(f0, f1, f2, f3, y, inc_y); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_GP2_INC(x, inc_x, f0, f1); | |||||
| ST_GP2_INC(f0, f1, y, inc_y); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *y = *x; | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,385 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||||
| BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i; | |||||
| FLOAT *px; | |||||
| FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15; | |||||
| v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; | |||||
| v4f32 da_vec; | |||||
| px = x; | |||||
| if (1 == inc_x) | |||||
| { | |||||
| if (0.0 == da) | |||||
| { | |||||
| v4f32 zero_v = __msa_cast_to_vector_float(0); | |||||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); | |||||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); | |||||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); | |||||
| zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); | |||||
| for (i = (n >> 6); i--;) | |||||
| { | |||||
| ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||||
| zero_v, zero_v, x, 4); | |||||
| ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||||
| zero_v, zero_v, x, 4); | |||||
| } | |||||
| if (n & 63) | |||||
| { | |||||
| if (n & 32) | |||||
| { | |||||
| ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||||
| zero_v, zero_v, x, 4); | |||||
| } | |||||
| if (n & 16) | |||||
| { | |||||
| ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4); | |||||
| } | |||||
| if (n & 8) | |||||
| { | |||||
| ST_SP2_INC(zero_v, zero_v, x, 4); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| *x = 0; x += 1; | |||||
| *x = 0; x += 1; | |||||
| *x = 0; x += 1; | |||||
| *x = 0; x += 1; | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| *x = 0; x += 1; | |||||
| *x = 0; x += 1; | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *x = 0; | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| da_vec = COPY_FLOAT_TO_VECTOR(da); | |||||
| if (n > 63) | |||||
| { | |||||
| FLOAT *x_pref; | |||||
| BLASLONG pref_offset; | |||||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| x_pref = x + pref_offset + 64 + 32; | |||||
| LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| for (i = 0; i < (n >> 6) - 1; i++) | |||||
| { | |||||
| PREF_OFFSET(x_pref, 0); | |||||
| PREF_OFFSET(x_pref, 32); | |||||
| PREF_OFFSET(x_pref, 64); | |||||
| PREF_OFFSET(x_pref, 96); | |||||
| PREF_OFFSET(x_pref, 128); | |||||
| PREF_OFFSET(x_pref, 160); | |||||
| PREF_OFFSET(x_pref, 192); | |||||
| PREF_OFFSET(x_pref, 224); | |||||
| x_pref += 64; | |||||
| x8 = LD_SP(px); px += 4; | |||||
| x0 *= da_vec; | |||||
| x9 = LD_SP(px); px += 4; | |||||
| x1 *= da_vec; | |||||
| x10 = LD_SP(px); px += 4; | |||||
| x2 *= da_vec; | |||||
| x11 = LD_SP(px); px += 4; | |||||
| x3 *= da_vec; | |||||
| x12 = LD_SP(px); px += 4; | |||||
| x4 *= da_vec; | |||||
| x13 = LD_SP(px); px += 4; | |||||
| x5 *= da_vec; | |||||
| x14 = LD_SP(px); px += 4; | |||||
| x6 *= da_vec; | |||||
| x15 = LD_SP(px); px += 4; | |||||
| x7 *= da_vec; | |||||
| x8 *= da_vec; | |||||
| ST_SP(x0, x); x += 4; | |||||
| x9 *= da_vec; | |||||
| ST_SP(x1, x); x += 4; | |||||
| x10 *= da_vec; | |||||
| ST_SP(x2, x); x += 4; | |||||
| x11 *= da_vec; | |||||
| ST_SP(x3, x); x += 4; | |||||
| x12 *= da_vec; | |||||
| ST_SP(x4, x); x += 4; | |||||
| x13 *= da_vec; | |||||
| ST_SP(x5, x); x += 4; | |||||
| x14 *= da_vec; | |||||
| ST_SP(x6, x); x += 4; | |||||
| x15 *= da_vec; | |||||
| ST_SP(x7, x); x += 4; | |||||
| ST_SP(x8, x); x += 4; | |||||
| x0 = LD_SP(px); px += 4; | |||||
| ST_SP(x9, x); x += 4; | |||||
| x1 = LD_SP(px); px += 4; | |||||
| ST_SP(x10, x); x += 4; | |||||
| x2 = LD_SP(px); px += 4; | |||||
| ST_SP(x11, x); x += 4; | |||||
| x3 = LD_SP(px); px += 4; | |||||
| ST_SP(x12, x); x += 4; | |||||
| x4 = LD_SP(px); px += 4; | |||||
| ST_SP(x13, x); x += 4; | |||||
| x5 = LD_SP(px); px += 4; | |||||
| ST_SP(x14, x); x += 4; | |||||
| x6 = LD_SP(px); px += 4; | |||||
| ST_SP(x15, x); x += 4; | |||||
| x7 = LD_SP(px); px += 4; | |||||
| } | |||||
| x8 = LD_SP(px); px += 4; | |||||
| x0 *= da_vec; | |||||
| x9 = LD_SP(px); px += 4; | |||||
| x1 *= da_vec; | |||||
| x10 = LD_SP(px); px += 4; | |||||
| x2 *= da_vec; | |||||
| x11 = LD_SP(px); px += 4; | |||||
| x3 *= da_vec; | |||||
| x12 = LD_SP(px); px += 4; | |||||
| x4 *= da_vec; | |||||
| x13 = LD_SP(px); px += 4; | |||||
| x5 *= da_vec; | |||||
| x14 = LD_SP(px); px += 4; | |||||
| x6 *= da_vec; | |||||
| x15 = LD_SP(px); px += 4; | |||||
| x7 *= da_vec; | |||||
| x8 *= da_vec; | |||||
| ST_SP(x0, x); x += 4; | |||||
| x9 *= da_vec; | |||||
| ST_SP(x1, x); x += 4; | |||||
| x10 *= da_vec; | |||||
| ST_SP(x2, x); x += 4; | |||||
| x11 *= da_vec; | |||||
| ST_SP(x3, x); x += 4; | |||||
| x12 *= da_vec; | |||||
| ST_SP(x4, x); x += 4; | |||||
| x13 *= da_vec; | |||||
| ST_SP(x5, x); x += 4; | |||||
| x15 *= da_vec; | |||||
| ST_SP(x6, x); x += 4; | |||||
| x14 *= da_vec; | |||||
| ST_SP(x7, x); x += 4; | |||||
| ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 4); | |||||
| } | |||||
| if (n & 63) | |||||
| { | |||||
| if (n & 32) | |||||
| { | |||||
| LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3); | |||||
| MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7); | |||||
| ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4); | |||||
| } | |||||
| if (n & 16) | |||||
| { | |||||
| LD_SP4_INC(px, 4, x0, x1, x2, x3); | |||||
| MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3); | |||||
| ST_SP4_INC(x0, x1, x2, x3, x, 4); | |||||
| } | |||||
| if (n & 8) | |||||
| { | |||||
| LD_SP2_INC(px, 4, x0, x1); | |||||
| MUL2(x0, da_vec, x1, da_vec, x0, x1); | |||||
| ST_SP2_INC(x0, x1, x, 4); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_GP4_INC(px, 1, f0, f1, f2, f3); | |||||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||||
| ST_GP4_INC(f0, f1, f2, f3, x, 1); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_GP2_INC(px, 1, f0, f1); | |||||
| MUL2(f0, da, f1, da, f0, f1); | |||||
| ST_GP2_INC(f0, f1, x, 1); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *x *= da; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| if (0.0 == da) | |||||
| { | |||||
| for (i = n; i--;) | |||||
| { | |||||
| *x = 0; | |||||
| x += inc_x; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| if (n > 15) | |||||
| { | |||||
| LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); | |||||
| for (i = 0; i < (n >> 4) - 1; i++) | |||||
| { | |||||
| LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15); | |||||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||||
| f4 *= da; | |||||
| f5 *= da; | |||||
| *x = f0; x += inc_x; | |||||
| f6 *= da; | |||||
| *x = f1; x += inc_x; | |||||
| f7 *= da; | |||||
| *x = f2; x += inc_x; | |||||
| f8 *= da; | |||||
| *x = f3; x += inc_x; | |||||
| f9 *= da; | |||||
| *x = f4; x += inc_x; | |||||
| f10 *= da; | |||||
| *x = f5; x += inc_x; | |||||
| f11 *= da; | |||||
| *x = f6; x += inc_x; | |||||
| f12 *= da; | |||||
| *x = f7; x += inc_x; | |||||
| f13 *= da; | |||||
| *x = f8; x += inc_x; | |||||
| f14 *= da; | |||||
| *x = f9; x += inc_x; | |||||
| f15 *= da; | |||||
| *x = f10; x += inc_x; | |||||
| *x = f11; x += inc_x; | |||||
| f0 = *px; px += inc_x; | |||||
| *x = f12; x += inc_x; | |||||
| f1 = *px; px += inc_x; | |||||
| *x = f13; x += inc_x; | |||||
| f2 = *px; px += inc_x; | |||||
| *x = f14; x += inc_x; | |||||
| f3 = *px; px += inc_x; | |||||
| *x = f15; x += inc_x; | |||||
| f4 = *px; px += inc_x; | |||||
| f5 = *px; px += inc_x; | |||||
| f6 = *px; px += inc_x; | |||||
| f7 = *px; px += inc_x; | |||||
| } | |||||
| LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15); | |||||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||||
| f4 *= da; | |||||
| f5 *= da; | |||||
| *x = f0; x += inc_x; | |||||
| f6 *= da; | |||||
| *x = f1; x += inc_x; | |||||
| f7 *= da; | |||||
| *x = f2; x += inc_x; | |||||
| f8 *= da; | |||||
| *x = f3; x += inc_x; | |||||
| f9 *= da; | |||||
| *x = f4; x += inc_x; | |||||
| f10 *= da; | |||||
| *x = f5; x += inc_x; | |||||
| f11 *= da; | |||||
| *x = f6; x += inc_x; | |||||
| f12 *= da; | |||||
| *x = f7; x += inc_x; | |||||
| f13 *= da; | |||||
| *x = f8; x += inc_x; | |||||
| f14 *= da; | |||||
| *x = f9; x += inc_x; | |||||
| f15 *= da; | |||||
| *x = f10; x += inc_x; | |||||
| *x = f11; x += inc_x; | |||||
| *x = f12; x += inc_x; | |||||
| *x = f13; x += inc_x; | |||||
| *x = f14; x += inc_x; | |||||
| *x = f15; x += inc_x; | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); | |||||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||||
| MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7); | |||||
| ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_GP4_INC(px, inc_x, f0, f1, f2, f3); | |||||
| MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3); | |||||
| ST_GP4_INC(f0, f1, f2, f3, x, inc_x); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_GP2_INC(px, inc_x, f0, f1); | |||||
| MUL2(f0, da, f1, da, f0, f1); | |||||
| ST_GP2_INC(f0, f1, x, inc_x); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| *x *= da; | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,267 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
/*
 * Swap kernel: exchanges n FLOAT elements between srcx and srcy in place.
 *
 * n      - element count; a negative n returns immediately.
 * srcx   - first vector; receives the values read from srcy.
 * inc_x  - pointer step applied to px/srcx between elements.
 * srcy   - second vector; receives the values read from srcx.
 * inc_y  - pointer step applied to py/srcy between elements.
 * dummy0, dummy1, dummy3, dummy, dummy2 - not referenced in the body.
 *
 * Always returns 0.
 *
 * px/py are the read cursors; srcx/srcy are reused as the write cursors.
 * NOTE(review): LD_SP/ST_SP, PREFETCH and the *_INC helpers come from
 * macros_msa.h, which is not visible here. Every v4f32 access below is paired
 * with a 4-element pointer advance, so each is presumably a 4-float vector
 * load/store - confirm against that header.
 */
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, | |||||
| FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y, | |||||
| FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i = 0, pref_offsetx, pref_offsety; | |||||
| FLOAT *px, *py; | |||||
| FLOAT x0, x1, x2, x3, x4, x5, x6, x7; | |||||
| FLOAT y0, y1, y2, y3, y4, y5, y6, y7; | |||||
| v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7; | |||||
| v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7; | |||||
| if (n < 0) return (0); | |||||
/* Distance, in FLOAT elements, from srcx up to the next L1_DATA_LINESIZE
   boundary (stays 0 when srcx is already on one); added to the prefetch
   addresses in the main loop. */
| pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offsetx > 0) | |||||
| { | |||||
| pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; | |||||
| pref_offsetx = pref_offsetx / sizeof(FLOAT); | |||||
| } | |||||
/* Same computation for srcy. */
| pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offsety > 0) | |||||
| { | |||||
| pref_offsety = L1_DATA_LINESIZE - pref_offsety; | |||||
| pref_offsety = pref_offsety / sizeof(FLOAT); | |||||
| } | |||||
| px = srcx; | |||||
| py = srcy; | |||||
/* Both steps are 1: vector path, 32 elements (8 vectors of 4) per block. */
| if ((1 == inc_x) && (1 == inc_y)) | |||||
| { | |||||
| if (n >> 5) | |||||
| { | |||||
/* Preload the first x block so that each loop iteration can store the
   previously loaded x block into y while loading the next blocks. */
| LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7); | |||||
/* (n >> 5) - 1 iterations; the last block is finished after the loop. */
| for (i = (n >> 5) - 1; i--;) | |||||
| { | |||||
| PREFETCH(px + pref_offsetx + 32); | |||||
| PREFETCH(px + pref_offsetx + 40); | |||||
| PREFETCH(px + pref_offsetx + 48); | |||||
| PREFETCH(px + pref_offsetx + 56); | |||||
| PREFETCH(py + pref_offsety + 32); | |||||
| PREFETCH(py + pref_offsety + 40); | |||||
| PREFETCH(py + pref_offsety + 48); | |||||
| PREFETCH(py + pref_offsety + 56); | |||||
/* Read the current y block while writing the held x block over it; each y
   vector is loaded before the store that overwrites its location. */
| yv0 = LD_SP(py); py += 4; | |||||
| ST_SP(xv0, srcy); srcy += 4; | |||||
| yv1 = LD_SP(py); py += 4; | |||||
| ST_SP(xv1, srcy); srcy += 4; | |||||
| yv2 = LD_SP(py); py += 4; | |||||
| ST_SP(xv2, srcy); srcy += 4; | |||||
| yv3 = LD_SP(py); py += 4; | |||||
| ST_SP(xv3, srcy); srcy += 4; | |||||
| yv4 = LD_SP(py); py += 4; | |||||
| ST_SP(xv4, srcy); srcy += 4; | |||||
| yv5 = LD_SP(py); py += 4; | |||||
| ST_SP(xv5, srcy); srcy += 4; | |||||
| yv6 = LD_SP(py); py += 4; | |||||
| ST_SP(xv6, srcy); srcy += 4; | |||||
| yv7 = LD_SP(py); py += 4; | |||||
| ST_SP(xv7, srcy); srcy += 4; | |||||
/* Load the next x block (px is one block ahead of srcx) while writing the
   y block just read into the previous x block's location. */
| xv0 = LD_SP(px); px += 4; | |||||
| ST_SP(yv0, srcx); srcx += 4; | |||||
| xv1 = LD_SP(px); px += 4; | |||||
| ST_SP(yv1, srcx); srcx += 4; | |||||
| xv2 = LD_SP(px); px += 4; | |||||
| ST_SP(yv2, srcx); srcx += 4; | |||||
| xv3 = LD_SP(px); px += 4; | |||||
| ST_SP(yv3, srcx); srcx += 4; | |||||
| xv4 = LD_SP(px); px += 4; | |||||
| ST_SP(yv4, srcx); srcx += 4; | |||||
| xv5 = LD_SP(px); px += 4; | |||||
| ST_SP(yv5, srcx); srcx += 4; | |||||
| xv6 = LD_SP(px); px += 4; | |||||
| ST_SP(yv6, srcx); srcx += 4; | |||||
| xv7 = LD_SP(px); px += 4; | |||||
| ST_SP(yv7, srcx); srcx += 4; | |||||
| } | |||||
/* Final full block: its x data is already held in xv0..xv7. */
| LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7); | |||||
| ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4); | |||||
| ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4); | |||||
| } | |||||
/* Remaining n & 31 elements: one branch moves 7, 6, 5, 3, 4, 2 or 1 vectors
   depending on which of bits 16/8/4 of n are set. */
| if (n & 31) | |||||
| { | |||||
| if ((n & 16) && (n & 8) && (n & 4)) | |||||
| { | |||||
| LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6); | |||||
| LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6); | |||||
| ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4); | |||||
| ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4); | |||||
| } | |||||
| else if ((n & 16) && (n & 8)) | |||||
| { | |||||
| LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5); | |||||
| LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5); | |||||
| ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4); | |||||
| ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4); | |||||
| } | |||||
| else if ((n & 16) && (n & 4)) | |||||
| { | |||||
| LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4); | |||||
| LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4); | |||||
| ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4); | |||||
| ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4); | |||||
| } | |||||
| else if ((n & 8) && (n & 4)) | |||||
| { | |||||
| LD_SP3_INC(px, 4, xv0, xv1, xv2); | |||||
| LD_SP3_INC(py, 4, yv0, yv1, yv2); | |||||
| ST_SP3_INC(xv0, xv1, xv2, srcy, 4); | |||||
| ST_SP3_INC(yv0, yv1, yv2, srcx, 4); | |||||
| } | |||||
| else if (n & 16) | |||||
| { | |||||
| LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3); | |||||
| LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3); | |||||
| ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4); | |||||
| ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4); | |||||
| } | |||||
| else if (n & 8) | |||||
| { | |||||
| LD_SP2_INC(px, 4, xv0, xv1); | |||||
| LD_SP2_INC(py, 4, yv0, yv1); | |||||
| ST_SP2_INC(xv0, xv1, srcy, 4); | |||||
| ST_SP2_INC(yv0, yv1, srcx, 4); | |||||
| } | |||||
| else if (n & 4) | |||||
| { | |||||
| xv0 = LD_SP(px); | |||||
| yv0 = LD_SP(py); | |||||
| px += 4; | |||||
| py += 4; | |||||
| ST_SP(xv0, srcy); | |||||
| ST_SP(yv0, srcx); | |||||
| srcx += 4; | |||||
| srcy += 4; | |||||
| } | |||||
/* Last 3, 2 or 1 elements (bits 2/1 of n) are swapped with scalar accesses. */
| if ((n & 2) && (n & 1)) | |||||
| { | |||||
| LD_GP3_INC(px, 1, x0, x1, x3); | |||||
| LD_GP3_INC(py, 1, y0, y1, y3); | |||||
| ST_GP3_INC(x0, x1, x3, srcy, 1); | |||||
| ST_GP3_INC(y0, y1, y3, srcx, 1); | |||||
| } | |||||
| else if (n & 2) | |||||
| { | |||||
| LD_GP2_INC(px, 1, x0, x1); | |||||
| LD_GP2_INC(py, 1, y0, y1); | |||||
| ST_GP2_INC(x0, x1, srcy, 1); | |||||
| ST_GP2_INC(y0, y1, srcx, 1); | |||||
| } | |||||
| else if (n & 1) | |||||
| { | |||||
| x0 = px[0]; | |||||
| y0 = py[0]; | |||||
| srcx[0] = y0; | |||||
| srcy[0] = x0; | |||||
| } | |||||
| } | |||||
| } | |||||
/* Any other step combination: scalar path, 8 elements per iteration. */
| else | |||||
| { | |||||
| for (i = (n >> 3); i--;) | |||||
| { | |||||
| LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7); | |||||
| ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y); | |||||
| ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x); | |||||
| } | |||||
/* Remaining n & 7 elements: one branch handles 7, 6, 5, 3, 4, 2 or 1. */
| if (n & 7) | |||||
| { | |||||
| if ((n & 4) && (n & 2) && (n & 1)) | |||||
| { | |||||
| LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6); | |||||
| LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6); | |||||
| ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y); | |||||
| ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x); | |||||
| } | |||||
| else if ((n & 4) && (n & 2)) | |||||
| { | |||||
| LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5); | |||||
| LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5); | |||||
| ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y); | |||||
| ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x); | |||||
| } | |||||
| else if ((n & 4) && (n & 1)) | |||||
| { | |||||
| LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4); | |||||
| LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4); | |||||
| ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y); | |||||
| ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x); | |||||
| } | |||||
| else if ((n & 2) && (n & 1)) | |||||
| { | |||||
| LD_GP3_INC(px, inc_x, x0, x1, x2); | |||||
| LD_GP3_INC(py, inc_y, y0, y1, y2); | |||||
| ST_GP3_INC(x0, x1, x2, srcy, inc_y); | |||||
| ST_GP3_INC(y0, y1, y2, srcx, inc_x); | |||||
| } | |||||
| else if (n & 4) | |||||
| { | |||||
| LD_GP4_INC(px, inc_x, x0, x1, x2, x3); | |||||
| LD_GP4_INC(py, inc_y, y0, y1, y2, y3); | |||||
| ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y); | |||||
| ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x); | |||||
| } | |||||
| else if (n & 2) | |||||
| { | |||||
| LD_GP2_INC(px, inc_x, x0, x1); | |||||
| LD_GP2_INC(py, inc_y, y0, y1); | |||||
| ST_GP2_INC(x0, x1, srcy, inc_y); | |||||
| ST_GP2_INC(y0, y1, srcx, inc_x); | |||||
| } | |||||
| else if (n & 1) | |||||
| { | |||||
/* This single-element case reads through the write cursors srcx/srcy
   rather than through px/py. */
| x0 = *srcx; | |||||
| y0 = *srcy; | |||||
| *srcx = y0; | |||||
| *srcy = x0; | |||||
| } | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,494 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
/*
 * Sign selectors for the complex multiply-accumulate performed by the kernel
 * below.  With alpha = da_r + i*da_i and x = x_re + i*x_im the kernel applies
 *
 *     y_re +=  da_r * x_re        (always added)
 *     y_im OP0 da_r * x_im
 *     y_re OP1 da_i * x_im
 *     y_im OP2 da_i * x_re
 *
 * Without CONJ this is y += alpha * x:
 *     y_re += da_r*x_re - da_i*x_im,   y_im += da_r*x_im + da_i*x_re
 * With CONJ it is y += alpha * conj(x):
 *     y_re += da_r*x_re + da_i*x_im,   y_im += da_i*x_re - da_r*x_im
 *
 * The da_i * x_re term is therefore added to y_im in both variants; OP2 must
 * be "+=" under CONJ as well ("-=" flipped the sign of that term).
 */
#if !defined(CONJ)
    #define OP0  +=
    #define OP1  -=
    #define OP2  +=
#else
    #define OP0  -=
    #define OP1  +=
    #define OP2  +=
#endif
/*
 * Complex AXPY kernel: accumulates a scaled x into y, in place.
 *
 * n            - number of elements; a negative n returns immediately.
 * da_r, da_i   - the two scalar factors; when both are 0.0 the function
 *                returns without touching y.
 * x, inc_x     - input vector and its step; x is only read.
 * y, inc_y     - vector updated in place (read through py, written through y).
 * dummy0, dummy1, dummy, dummy2 - not referenced in the body.
 *
 * Always returns 0.
 *
 * Per element, as spelled out by the scalar n & 1 tails (xd0/xd1 and yd0/yd1
 * are two consecutive FLOATs of x and y):
 *     yd0 +=  da_r * xd0;   yd1 OP0 da_r * xd1;
 *     yd0 OP1 da_i * xd1;   yd1 OP2 da_i * xd0;
 * OP0/OP1/OP2 are the file-level macros defined just above this function;
 * they expand to += or -= depending on whether CONJ is defined.
 *
 * The body is four copies of the same 8/4/2/1-element sequence, selected by
 * stride: both 1, only inc_y 1, only inc_x 1, neither 1.  A non-unit step is
 * doubled (inc_x2 / inc_y2) before being handed to the load/store helpers.
 *
 * NOTE(review): PCKEVOD_D2_DP, ILVRL_D2_DP, FMADD2/FMADD4, PREF_OFFSET and the
 * LD_ / ST_ helpers are defined in macros_msa.h, not visible here.  From the
 * r/i naming, PCKEVOD presumably splits two v2f64 registers into even (first
 * FLOAT of each pair) and odd lanes and ILVRL re-interleaves them - confirm.
 */
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i, inc_x2, inc_y2; | |||||
| FLOAT *py; | |||||
| v2f64 x0, x1, x2, x3, x4, x5, x6, x7; | |||||
| v2f64 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec; | |||||
| v2f64 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i; | |||||
| v2f64 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i; | |||||
| FLOAT xd0, xd1, yd0, yd1; | |||||
| if (n < 0) return(0); | |||||
| if ((da_r == 0.0) && (da_i == 0.0)) return(0); | |||||
/* py reads y ahead of the stores, which advance y itself. */
| py = y; | |||||
/* Vector forms of the two scalar factors (presumably broadcast to all lanes
   - COPY_DOUBLE_TO_VECTOR is defined outside this view). */
| dar_vec = COPY_DOUBLE_TO_VECTOR(da_r); | |||||
| dai_vec = COPY_DOUBLE_TO_VECTOR(da_i); | |||||
/* Case 1: both steps are 1. */
| if ((1 == inc_x) && (1 == inc_y)) | |||||
| { | |||||
| FLOAT *x_pref, *y_pref; | |||||
| BLASLONG pref_offset; | |||||
/* Prefetch cursors start 32 FLOATs past the next L1_DATA_LINESIZE boundary
   at or after x / y (pref_offset is that distance in FLOATs, 0 if aligned). */
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| x_pref = x + pref_offset + 32; | |||||
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| y_pref = y + pref_offset + 32; | |||||
/* Main loop: 8 elements (8 v2f64 loads from each vector) per iteration. */
| for (i = (n >> 3); i--;) | |||||
| { | |||||
| PREF_OFFSET(x_pref, 0); | |||||
| PREF_OFFSET(x_pref, 32); | |||||
| PREF_OFFSET(x_pref, 64); | |||||
| PREF_OFFSET(x_pref, 96); | |||||
| PREF_OFFSET(y_pref, 0); | |||||
| PREF_OFFSET(y_pref, 32); | |||||
| PREF_OFFSET(y_pref, 64); | |||||
| PREF_OFFSET(y_pref, 96); | |||||
| x_pref += 16; | |||||
| y_pref += 16; | |||||
| LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); | |||||
/* Separate each register pair into an "r" and an "i" vector. */
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||||
| PCKEVOD_D2_DP(x5, x4, x2r, x2i); | |||||
| PCKEVOD_D2_DP(y5, y4, y2r, y2i); | |||||
| PCKEVOD_D2_DP(x7, x6, x3r, x3i); | |||||
| PCKEVOD_D2_DP(y7, y6, y3r, y3i); | |||||
/* Same four update terms as the scalar tail, applied lane-wise; the FMADD
   helper presumably performs y?r += x?r * dar_vec (cf. the n & 2 tail). */
| FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y1i OP0 dar_vec * x1i; | |||||
| y2i OP0 dar_vec * x2i; | |||||
| y3i OP0 dar_vec * x3i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y1r OP1 dai_vec * x1i; | |||||
| y2r OP1 dai_vec * x2i; | |||||
| y3r OP1 dai_vec * x3i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| y1i OP2 dai_vec * x1r; | |||||
| y2i OP2 dai_vec * x2r; | |||||
| y3i OP2 dai_vec * x3r; | |||||
/* Recombine the r/i vectors into per-element registers and store. */
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||||
| ILVRL_D2_DP(y2i, y2r, y4, y5); | |||||
| ILVRL_D2_DP(y3i, y3r, y6, y7); | |||||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); | |||||
| } | |||||
/* Tail: bits 4, 2 and 1 of n are handled independently, in that order. */
| if (n & 7) | |||||
| { | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP4_INC(x, 2, x0, x1, x2, x3); | |||||
| LD_DP4_INC(py, 2, y0, y1, y2, y3); | |||||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||||
| FMADD2(x0r, x1r, dar_vec, y0r, y1r); | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y1i OP0 dar_vec * x1i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y1r OP1 dai_vec * x1i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| y1i OP2 dai_vec * x1r; | |||||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||||
| ST_DP4_INC(y0, y1, y2, y3, y, 2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_DP2_INC(x, 2, x0, x1); | |||||
| LD_DP2_INC(py, 2, y0, y1); | |||||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||||
| y0r += dar_vec * x0r; | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||||
| ST_DP2_INC(y0, y1, y, 2); | |||||
| } | |||||
/* Last element: two scalar FLOATs from each vector. */
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(x, 1, xd0, xd1); | |||||
| LD_GP2_INC(py, 1, yd0, yd1); | |||||
| yd0 += da_r * xd0; | |||||
| yd1 OP0 da_r * xd1; | |||||
| yd0 OP1 da_i * xd1; | |||||
| yd1 OP2 da_i * xd0; | |||||
| ST_GP2_INC(yd0, yd1, y, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
/* Case 2: inc_y is 1, inc_x is not; only y is prefetched. */
| else if (1 == inc_y) | |||||
| { | |||||
| FLOAT *y_pref; | |||||
| BLASLONG pref_offset; | |||||
| pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| y_pref = y + pref_offset + 32; | |||||
/* Step between consecutive x elements, doubled for the load helpers. */
| inc_x2 = 2 * inc_x; | |||||
| for (i = (n >> 3); i--;) | |||||
| { | |||||
| PREF_OFFSET(y_pref, 0); | |||||
| PREF_OFFSET(y_pref, 32); | |||||
| PREF_OFFSET(y_pref, 64); | |||||
| PREF_OFFSET(y_pref, 96); | |||||
| y_pref += 16; | |||||
| LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); | |||||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||||
| PCKEVOD_D2_DP(x5, x4, x2r, x2i); | |||||
| PCKEVOD_D2_DP(y5, y4, y2r, y2i); | |||||
| PCKEVOD_D2_DP(x7, x6, x3r, x3i); | |||||
| PCKEVOD_D2_DP(y7, y6, y3r, y3i); | |||||
| FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y1i OP0 dar_vec * x1i; | |||||
| y2i OP0 dar_vec * x2i; | |||||
| y3i OP0 dar_vec * x3i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y1r OP1 dai_vec * x1i; | |||||
| y2r OP1 dai_vec * x2i; | |||||
| y3r OP1 dai_vec * x3i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| y1i OP2 dai_vec * x1r; | |||||
| y2i OP2 dai_vec * x2r; | |||||
| y3i OP2 dai_vec * x3r; | |||||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||||
| ILVRL_D2_DP(y2i, y2r, y4, y5); | |||||
| ILVRL_D2_DP(y3i, y3r, y6, y7); | |||||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); | |||||
| } | |||||
| if (n & 7) | |||||
| { | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP4_INC(x, inc_x2, x0, x1, x2, x3); | |||||
| LD_DP4_INC(py, 2, y0, y1, y2, y3); | |||||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||||
| FMADD2(x0r, x1r, dar_vec, y0r, y1r); | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y1i OP0 dar_vec * x1i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y1r OP1 dai_vec * x1i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| y1i OP2 dai_vec * x1r; | |||||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||||
| ST_DP4_INC(y0, y1, y2, y3, y, 2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_DP2_INC(x, inc_x2, x0, x1); | |||||
| LD_DP2_INC(py, 2, y0, y1); | |||||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||||
| y0r += dar_vec * x0r; | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||||
| ST_DP2_INC(y0, y1, y, 2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(x, 1, xd0, xd1); | |||||
| LD_GP2_INC(py, 1, yd0, yd1); | |||||
| yd0 += da_r * xd0; | |||||
| yd1 OP0 da_r * xd1; | |||||
| yd0 OP1 da_i * xd1; | |||||
| yd1 OP2 da_i * xd0; | |||||
| ST_GP2_INC(yd0, yd1, y, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
/* Case 3: inc_x is 1, inc_y is not; only x is prefetched. */
| else if (1 == inc_x) | |||||
| { | |||||
| FLOAT *x_pref; | |||||
| BLASLONG pref_offset; | |||||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| x_pref = x + pref_offset + 32; | |||||
/* Step between consecutive y elements, doubled for the load/store helpers. */
| inc_y2 = 2 * inc_y; | |||||
| for (i = (n >> 3); i--;) | |||||
| { | |||||
| PREF_OFFSET(x_pref, 0); | |||||
| PREF_OFFSET(x_pref, 32); | |||||
| PREF_OFFSET(x_pref, 64); | |||||
| PREF_OFFSET(x_pref, 96); | |||||
| x_pref += 16; | |||||
| LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7); | |||||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||||
| PCKEVOD_D2_DP(x5, x4, x2r, x2i); | |||||
| PCKEVOD_D2_DP(y5, y4, y2r, y2i); | |||||
| PCKEVOD_D2_DP(x7, x6, x3r, x3i); | |||||
| PCKEVOD_D2_DP(y7, y6, y3r, y3i); | |||||
| FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y1i OP0 dar_vec * x1i; | |||||
| y2i OP0 dar_vec * x2i; | |||||
| y3i OP0 dar_vec * x3i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y1r OP1 dai_vec * x1i; | |||||
| y2r OP1 dai_vec * x2i; | |||||
| y3r OP1 dai_vec * x3i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| y1i OP2 dai_vec * x1r; | |||||
| y2i OP2 dai_vec * x2r; | |||||
| y3i OP2 dai_vec * x3r; | |||||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||||
| ILVRL_D2_DP(y2i, y2r, y4, y5); | |||||
| ILVRL_D2_DP(y3i, y3r, y6, y7); | |||||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2); | |||||
| } | |||||
| if (n & 7) | |||||
| { | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP4_INC(x, 2, x0, x1, x2, x3); | |||||
| LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); | |||||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||||
| FMADD2(x0r, x1r, dar_vec, y0r, y1r); | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y1i OP0 dar_vec * x1i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y1r OP1 dai_vec * x1i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| y1i OP2 dai_vec * x1r; | |||||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||||
| ST_DP4_INC(y0, y1, y2, y3, y, inc_y2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_DP2_INC(x, 2, x0, x1); | |||||
| LD_DP2_INC(py, inc_y2, y0, y1); | |||||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||||
| y0r += dar_vec * x0r; | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||||
| ST_DP2_INC(y0, y1, y, inc_y2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(x, 1, xd0, xd1); | |||||
| LD_GP2_INC(py, 1, yd0, yd1); | |||||
| yd0 += da_r * xd0; | |||||
| yd1 OP0 da_r * xd1; | |||||
| yd0 OP1 da_i * xd1; | |||||
| yd1 OP2 da_i * xd0; | |||||
| ST_GP2_INC(yd0, yd1, y, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
/* Case 4: neither step is 1; same arithmetic, no prefetching. */
| else | |||||
| { | |||||
| inc_x2 = 2 * inc_x; | |||||
| inc_y2 = 2 * inc_y; | |||||
| for (i = (n >> 3); i--;) | |||||
| { | |||||
| LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7); | |||||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||||
| PCKEVOD_D2_DP(x5, x4, x2r, x2i); | |||||
| PCKEVOD_D2_DP(y5, y4, y2r, y2i); | |||||
| PCKEVOD_D2_DP(x7, x6, x3r, x3i); | |||||
| PCKEVOD_D2_DP(y7, y6, y3r, y3i); | |||||
| FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y1i OP0 dar_vec * x1i; | |||||
| y2i OP0 dar_vec * x2i; | |||||
| y3i OP0 dar_vec * x3i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y1r OP1 dai_vec * x1i; | |||||
| y2r OP1 dai_vec * x2i; | |||||
| y3r OP1 dai_vec * x3i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| y1i OP2 dai_vec * x1r; | |||||
| y2i OP2 dai_vec * x2r; | |||||
| y3i OP2 dai_vec * x3r; | |||||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||||
| ILVRL_D2_DP(y2i, y2r, y4, y5); | |||||
| ILVRL_D2_DP(y3i, y3r, y6, y7); | |||||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2); | |||||
| } | |||||
| if (n & 7) | |||||
| { | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP4_INC(x, inc_x2, x0, x1, x2, x3); | |||||
| LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); | |||||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||||
| PCKEVOD_D2_DP(x3, x2, x1r, x1i); | |||||
| PCKEVOD_D2_DP(y3, y2, y1r, y1i); | |||||
| FMADD2(x0r, x1r, dar_vec, y0r, y1r); | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y1i OP0 dar_vec * x1i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y1r OP1 dai_vec * x1i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| y1i OP2 dai_vec * x1r; | |||||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||||
| ILVRL_D2_DP(y1i, y1r, y2, y3); | |||||
| ST_DP4_INC(y0, y1, y2, y3, y, inc_y2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_DP2_INC(x, inc_x2, x0, x1); | |||||
| LD_DP2_INC(py, inc_y2, y0, y1); | |||||
| PCKEVOD_D2_DP(x1, x0, x0r, x0i); | |||||
| PCKEVOD_D2_DP(y1, y0, y0r, y0i); | |||||
| y0r += dar_vec * x0r; | |||||
| y0i OP0 dar_vec * x0i; | |||||
| y0r OP1 dai_vec * x0i; | |||||
| y0i OP2 dai_vec * x0r; | |||||
| ILVRL_D2_DP(y0i, y0r, y0, y1); | |||||
| ST_DP2_INC(y0, y1, y, inc_y2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(x, 1, xd0, xd1); | |||||
| LD_GP2_INC(py, 1, yd0, yd1); | |||||
| yd0 += da_r * xd0; | |||||
| yd1 OP0 da_r * xd1; | |||||
| yd0 OP1 da_i * xd1; | |||||
| yd1 OP2 da_i * xd0; | |||||
| ST_GP2_INC(yd0, yd1, y, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,218 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
/*
 * Copy kernel: copies n elements from x to y.  Each element is moved as one
 * v2f64 access (the pointers advance by 2 FLOATs per element when the steps
 * are 1), or as two scalar FLOATs (f0, f1) in the final n & 1 step.
 *
 * n      - element count; a negative n returns immediately.
 * x      - source; only read.
 * inc_x  - source step; doubled before use when the general path is taken.
 * y      - destination.
 * inc_y  - destination step; doubled likewise.
 *
 * Always returns 0.
 *
 * NOTE(review): LD_DP/ST_DP, PREF_OFFSET and the *_INC helpers are defined in
 * macros_msa.h, which is not visible here.
 */
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i; | |||||
| v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; | |||||
| FLOAT f0, f1; | |||||
| if (n < 0) return (0); | |||||
/* Both steps are 1: pipelined path, 16 elements per block. */
| if ((1 == inc_x) && (1 == inc_y)) | |||||
| { | |||||
| if (n > 15) | |||||
| { | |||||
| FLOAT *x_pref; | |||||
| BLASLONG pref_offset; | |||||
/* pref_offset: distance in FLOATs from x to the next L1_DATA_LINESIZE
   boundary (0 when already on one); the prefetch cursor starts 64 + 16
   FLOATs beyond that. */
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| x_pref = x + pref_offset + 64 + 16; | |||||
/* Preload the first 8 elements; inside the loop every store writes a
   register loaded 8 steps earlier, so loads run ahead of stores. */
| LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
/* (n >> 4) - 1 iterations; the last block is drained after the loop. */
| for (i = (n >> 4) - 1; i--;) | |||||
| { | |||||
| PREF_OFFSET(x_pref, 0); | |||||
| PREF_OFFSET(x_pref, 32); | |||||
| PREF_OFFSET(x_pref, 64); | |||||
| PREF_OFFSET(x_pref, 96); | |||||
| PREF_OFFSET(x_pref, 128); | |||||
| PREF_OFFSET(x_pref, 160); | |||||
| PREF_OFFSET(x_pref, 192); | |||||
| PREF_OFFSET(x_pref, 224); | |||||
| x_pref += 32; | |||||
| x8 = LD_DP(x); x += 2; | |||||
| ST_DP(x0, y); y += 2; | |||||
| x9 = LD_DP(x); x += 2; | |||||
| ST_DP(x1, y); y += 2; | |||||
| x10 = LD_DP(x); x += 2; | |||||
| ST_DP(x2, y); y += 2; | |||||
| x11 = LD_DP(x); x += 2; | |||||
| ST_DP(x3, y); y += 2; | |||||
| x12 = LD_DP(x); x += 2; | |||||
| ST_DP(x4, y); y += 2; | |||||
| x13 = LD_DP(x); x += 2; | |||||
| ST_DP(x5, y); y += 2; | |||||
| x14 = LD_DP(x); x += 2; | |||||
| ST_DP(x6, y); y += 2; | |||||
| x15 = LD_DP(x); x += 2; | |||||
| ST_DP(x7, y); y += 2; | |||||
| x0 = LD_DP(x); x += 2; | |||||
| ST_DP(x8, y); y += 2; | |||||
| x1 = LD_DP(x); x += 2; | |||||
| ST_DP(x9, y); y += 2; | |||||
| x2 = LD_DP(x); x += 2; | |||||
| ST_DP(x10, y); y += 2; | |||||
| x3 = LD_DP(x); x += 2; | |||||
| ST_DP(x11, y); y += 2; | |||||
| x4 = LD_DP(x); x += 2; | |||||
| ST_DP(x12, y); y += 2; | |||||
| x5 = LD_DP(x); x += 2; | |||||
| ST_DP(x13, y); y += 2; | |||||
| x6 = LD_DP(x); x += 2; | |||||
| ST_DP(x14, y); y += 2; | |||||
| x7 = LD_DP(x); x += 2; | |||||
| ST_DP(x15, y); y += 2; | |||||
| } | |||||
/* Drain: load the second half of the last block (x8..x15) while storing the
   first half already held in x0..x7, then store the second half. */
| x8 = LD_DP(x); x += 2; | |||||
| x9 = LD_DP(x); x += 2; | |||||
| ST_DP(x0, y); y += 2; | |||||
| x10 = LD_DP(x); x += 2; | |||||
| ST_DP(x1, y); y += 2; | |||||
| x11 = LD_DP(x); x += 2; | |||||
| ST_DP(x2, y); y += 2; | |||||
| x12 = LD_DP(x); x += 2; | |||||
| ST_DP(x3, y); y += 2; | |||||
| x13 = LD_DP(x); x += 2; | |||||
| ST_DP(x4, y); y += 2; | |||||
| x14 = LD_DP(x); x += 2; | |||||
| ST_DP(x5, y); y += 2; | |||||
| x15 = LD_DP(x); x += 2; | |||||
| ST_DP(x6, y); y += 2; | |||||
| ST_DP(x7, y); y += 2; | |||||
| ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2); | |||||
| } | |||||
/* Tail: bits 8, 4, 2 and 1 of n are handled independently, in that order. */
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP4_INC(x, 2, x0, x1, x2, x3); | |||||
| ST_DP4_INC(x0, x1, x2, x3, y, 2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_DP2_INC(x, 2, x0, x1); | |||||
| ST_DP2_INC(x0, x1, y, 2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(x, 1, f0, f1); | |||||
| ST_GP2_INC(f0, f1, y, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
/* Any other step combination: same block sizes, no prefetching. */
| else | |||||
| { | |||||
/* Double both steps so they can be added directly to the FLOAT pointers
   (each element occupies 2 FLOATs). */
| inc_x *= 2; | |||||
| inc_y *= 2; | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| x0 = LD_DP(x); x += inc_x; | |||||
| x1 = LD_DP(x); x += inc_x; | |||||
| x2 = LD_DP(x); x += inc_x; | |||||
| x3 = LD_DP(x); x += inc_x; | |||||
| x4 = LD_DP(x); x += inc_x; | |||||
| x5 = LD_DP(x); x += inc_x; | |||||
| x6 = LD_DP(x); x += inc_x; | |||||
| x7 = LD_DP(x); x += inc_x; | |||||
| x8 = LD_DP(x); x += inc_x; | |||||
| ST_DP(x0, y); y += inc_y; | |||||
| x9 = LD_DP(x); x += inc_x; | |||||
| ST_DP(x1, y); y += inc_y; | |||||
| x10 = LD_DP(x); x += inc_x; | |||||
| ST_DP(x2, y); y += inc_y; | |||||
| x11 = LD_DP(x); x += inc_x; | |||||
| ST_DP(x3, y); y += inc_y; | |||||
| x12 = LD_DP(x); x += inc_x; | |||||
| ST_DP(x4, y); y += inc_y; | |||||
| x13 = LD_DP(x); x += inc_x; | |||||
| ST_DP(x5, y); y += inc_y; | |||||
| x14 = LD_DP(x); x += inc_x; | |||||
| ST_DP(x6, y); y += inc_y; | |||||
| x15 = LD_DP(x); x += inc_x; | |||||
| ST_DP(x7, y); y += inc_y; | |||||
| ST_DP(x8, y); y += inc_y; | |||||
| ST_DP(x9, y); y += inc_y; | |||||
| ST_DP(x10, y); y += inc_y; | |||||
| ST_DP(x11, y); y += inc_y; | |||||
| ST_DP(x12, y); y += inc_y; | |||||
| ST_DP(x13, y); y += inc_y; | |||||
| ST_DP(x14, y); y += inc_y; | |||||
| ST_DP(x15, y); y += inc_y; | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, inc_y); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP4_INC(x, inc_x, x0, x1, x2, x3); | |||||
| ST_DP4_INC(x0, x1, x2, x3, y, inc_y); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_DP2_INC(x, inc_x, x0, x1); | |||||
| ST_DP2_INC(x0, x1, y, inc_y); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(x, 1, f0, f1); | |||||
| ST_GP2_INC(f0, f1, y, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,717 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2017, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| /* This will shuffle the elements in 'in' vector as (mask needed :: 01 00 11 10) | |||||
| 0 1 2 3 => 2 3 0 1 */ | |||||
| #define SHF_78 78 | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, | |||||
| BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i, inc_x2; | |||||
| FLOAT *px; | |||||
| FLOAT tp0, tp1, f0, f1; | |||||
| v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; | |||||
| v2f64 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15; | |||||
| v2f64 da_i_vec, da_i_vec_neg, da_r_vec; | |||||
| px = x; | |||||
| if (1 == inc_x) | |||||
| { | |||||
| if ((0.0 == da_r) && (0.0 == da_i)) | |||||
| { | |||||
| v2f64 zero_v = __msa_cast_to_vector_double(0); | |||||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||||
| zero_v, zero_v, x, 2); | |||||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||||
| zero_v, zero_v, x, 2); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||||
| zero_v, zero_v, x, 2); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| ST_DP2_INC(zero_v, zero_v, x, 2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| ST_DP(zero_v, x); | |||||
| } | |||||
| } | |||||
| } | |||||
| else if (0.0 == da_r) | |||||
| { | |||||
| da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i); | |||||
| da_i_vec_neg = -da_i_vec; | |||||
| da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec); | |||||
| if (n > 15) | |||||
| { | |||||
| FLOAT *x_pref; | |||||
| BLASLONG pref_offset; | |||||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| x_pref = x + pref_offset + 32 + 16; | |||||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| for (i = (n >> 4)- 1; i--;) | |||||
| { | |||||
| PREF_OFFSET(x_pref, 0); | |||||
| PREF_OFFSET(x_pref, 32); | |||||
| PREF_OFFSET(x_pref, 64); | |||||
| PREF_OFFSET(x_pref, 96); | |||||
| PREF_OFFSET(x_pref, 128); | |||||
| PREF_OFFSET(x_pref, 160); | |||||
| PREF_OFFSET(x_pref, 192); | |||||
| PREF_OFFSET(x_pref, 224); | |||||
| x_pref += 32; | |||||
| x8 = LD_DP(px); px += 2; | |||||
| x0 *= da_i_vec; | |||||
| x9 = LD_DP(px); px += 2; | |||||
| x1 *= da_i_vec; | |||||
| x10 = LD_DP(px); px += 2; | |||||
| x2 *= da_i_vec; | |||||
| x11 = LD_DP(px); px += 2; | |||||
| x3 *= da_i_vec; | |||||
| x12 = LD_DP(px); px += 2; | |||||
| x4 *= da_i_vec; | |||||
| x13 = LD_DP(px); px += 2; | |||||
| x5 *= da_i_vec; | |||||
| x0 = (v2f64) __msa_shf_w((v4i32) x0, SHF_78); | |||||
| x14 = LD_DP(px); px += 2; | |||||
| x6 *= da_i_vec; | |||||
| x1 = (v2f64) __msa_shf_w((v4i32) x1, SHF_78); | |||||
| x15 = LD_DP(px); px += 2; | |||||
| x7 *= da_i_vec; | |||||
| x2 = (v2f64) __msa_shf_w((v4i32) x2, SHF_78); | |||||
| x8 *= da_i_vec; | |||||
| x3 = (v2f64) __msa_shf_w((v4i32) x3, SHF_78); | |||||
| ST_DP(x0, x); x += 2; | |||||
| x9 *= da_i_vec; | |||||
| x4 = (v2f64) __msa_shf_w((v4i32) x4, SHF_78); | |||||
| ST_DP(x1, x); x += 2; | |||||
| x10 *= da_i_vec; | |||||
| x5 = (v2f64) __msa_shf_w((v4i32) x5, SHF_78); | |||||
| ST_DP(x2, x); x += 2; | |||||
| x11 *= da_i_vec; | |||||
| x6 = (v2f64) __msa_shf_w((v4i32) x6, SHF_78); | |||||
| ST_DP(x3, x); x += 2; | |||||
| x12 *= da_i_vec; | |||||
| x7 = (v2f64) __msa_shf_w((v4i32) x7, SHF_78); | |||||
| ST_DP(x4, x); x += 2; | |||||
| x13 *= da_i_vec; | |||||
| x8 = (v2f64) __msa_shf_w((v4i32) x8, SHF_78); | |||||
| ST_DP(x5, x); x += 2; | |||||
| x14 *= da_i_vec; | |||||
| x9 = (v2f64) __msa_shf_w((v4i32) x9, SHF_78); | |||||
| ST_DP(x6, x); x += 2; | |||||
| x15 *= da_i_vec; | |||||
| x10 = (v2f64) __msa_shf_w((v4i32) x10, SHF_78); | |||||
| ST_DP(x7, x); x += 2; | |||||
| x11 = (v2f64) __msa_shf_w((v4i32) x11, SHF_78); | |||||
| ST_DP(x8, x); x += 2; | |||||
| x0 = LD_DP(px); px += 2; | |||||
| x12 = (v2f64) __msa_shf_w((v4i32) x12, SHF_78); | |||||
| ST_DP(x9, x); x += 2; | |||||
| x1 = LD_DP(px); px += 2; | |||||
| x13 = (v2f64) __msa_shf_w((v4i32) x13, SHF_78); | |||||
| ST_DP(x10, x); x += 2; | |||||
| x2 = LD_DP(px); px += 2; | |||||
| x14 = (v2f64) __msa_shf_w((v4i32) x14, SHF_78); | |||||
| ST_DP(x11, x); x += 2; | |||||
| x3 = LD_DP(px); px += 2; | |||||
| x15 = (v2f64) __msa_shf_w((v4i32) x15, SHF_78); | |||||
| ST_DP(x12, x); x += 2; | |||||
| x4 = LD_DP(px); px += 2; | |||||
| ST_DP(x13, x); x += 2; | |||||
| x5 = LD_DP(px); px += 2; | |||||
| ST_DP(x14, x); x += 2; | |||||
| x6 = LD_DP(px); px += 2; | |||||
| ST_DP(x15, x); x += 2; | |||||
| x7 = LD_DP(px); px += 2; | |||||
| } | |||||
| LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15); | |||||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||||
| x0, x1, x2, x3); | |||||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||||
| x4, x5, x6, x7); | |||||
| MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, | |||||
| x8, x9, x10, x11); | |||||
| MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, | |||||
| x12, x13, x14, x15); | |||||
| SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); | |||||
| SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78); | |||||
| SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78); | |||||
| SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78); | |||||
| ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, | |||||
| x12, x13, x14, x15, x, 2); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||||
| x0, x1, x2, x3); | |||||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||||
| x4, x5, x6, x7); | |||||
| SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); | |||||
| SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78); | |||||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP4_INC(px, 2, x0, x1, x2, x3); | |||||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||||
| x0, x1, x2, x3); | |||||
| SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); | |||||
| ST_DP4_INC(x0, x1, x2, x3, x, 2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_DP2_INC(px, 2, x0, x1); | |||||
| MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1); | |||||
| SHF_W2_DP(x0, x1, x0, x1, SHF_78); | |||||
| ST_DP2_INC(x0, x1, x, 2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(px, 1, f0, f1); | |||||
| MUL2(f0, da_i, f1, -da_i, f0, f1); | |||||
| ST_GP2_INC(f1, f0, x, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
| else if (0.0 == da_i) | |||||
| { | |||||
| da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r); | |||||
| if (n > 15) | |||||
| { | |||||
| FLOAT *x_pref; | |||||
| BLASLONG pref_offset; | |||||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| x_pref = x + pref_offset + 32 + 16; | |||||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| for (i = (n >> 4)- 1; i--;) | |||||
| { | |||||
| PREF_OFFSET(x_pref, 0); | |||||
| PREF_OFFSET(x_pref, 32); | |||||
| PREF_OFFSET(x_pref, 64); | |||||
| PREF_OFFSET(x_pref, 96); | |||||
| PREF_OFFSET(x_pref, 128); | |||||
| PREF_OFFSET(x_pref, 160); | |||||
| PREF_OFFSET(x_pref, 192); | |||||
| PREF_OFFSET(x_pref, 224); | |||||
| x_pref += 32; | |||||
| x8 = LD_DP(px); px += 2; | |||||
| x0 *= da_r_vec; | |||||
| x9 = LD_DP(px); px += 2; | |||||
| x1 *= da_r_vec; | |||||
| x10 = LD_DP(px); px += 2; | |||||
| x2 *= da_r_vec; | |||||
| x11 = LD_DP(px); px += 2; | |||||
| x3 *= da_r_vec; | |||||
| x12 = LD_DP(px); px += 2; | |||||
| x4 *= da_r_vec; | |||||
| x13 = LD_DP(px); px += 2; | |||||
| x5 *= da_r_vec; | |||||
| ST_DP(x0, x); x += 2; | |||||
| x14 = LD_DP(px); px += 2; | |||||
| x6 *= da_r_vec; | |||||
| ST_DP(x1, x); x += 2; | |||||
| x15 = LD_DP(px); px += 2; | |||||
| x7 *= da_r_vec; | |||||
| ST_DP(x2, x); x += 2; | |||||
| x8 *= da_r_vec; | |||||
| ST_DP(x3, x); x += 2; | |||||
| x9 *= da_r_vec; | |||||
| ST_DP(x4, x); x += 2; | |||||
| x10 *= da_r_vec; | |||||
| ST_DP(x5, x); x += 2; | |||||
| x11 *= da_r_vec; | |||||
| ST_DP(x6, x); x += 2; | |||||
| x12 *= da_r_vec; | |||||
| ST_DP(x7, x); x += 2; | |||||
| x13 *= da_r_vec; | |||||
| ST_DP(x8, x); x += 2; | |||||
| x0 = LD_DP(px); px += 2; | |||||
| x14 *= da_r_vec; | |||||
| ST_DP(x9, x); x += 2; | |||||
| x1 = LD_DP(px); px += 2; | |||||
| x15 *= da_r_vec; | |||||
| ST_DP(x10, x); x += 2; | |||||
| x2 = LD_DP(px); px += 2; | |||||
| ST_DP(x11, x); x += 2; | |||||
| x3 = LD_DP(px); px += 2; | |||||
| ST_DP(x12, x); x += 2; | |||||
| x4 = LD_DP(px); px += 2; | |||||
| ST_DP(x13, x); x += 2; | |||||
| x5 = LD_DP(px); px += 2; | |||||
| ST_DP(x14, x); x += 2; | |||||
| x6 = LD_DP(px); px += 2; | |||||
| ST_DP(x15, x); x += 2; | |||||
| x7 = LD_DP(px); px += 2; | |||||
| } | |||||
| LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15); | |||||
| MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, | |||||
| x0, x1, x2, x3); | |||||
| MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, | |||||
| x4, x5, x6, x7); | |||||
| MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec, | |||||
| x8, x9, x10, x11); | |||||
| MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec, | |||||
| x12, x13, x14, x15); | |||||
| ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, | |||||
| x12, x13, x14, x15, x, 2); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, | |||||
| x0, x1, x2, x3); | |||||
| MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, | |||||
| x4, x5, x6, x7); | |||||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP4_INC(px, 2, x0, x1, x2, x3); | |||||
| MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, | |||||
| x0, x1, x2, x3); | |||||
| ST_DP4_INC(x0, x1, x2, x3, x, 2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_DP2_INC(px, 2, x0, x1); | |||||
| MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1); | |||||
| ST_DP2_INC(x0, x1, x, 2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(px, 1, f0, f1); | |||||
| MUL2(f0, da_r, f1, da_r, f0, f1); | |||||
| ST_GP2_INC(f0, f1, x, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| FLOAT *x_pref; | |||||
| BLASLONG pref_offset; | |||||
| pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offset > 0) | |||||
| { | |||||
| pref_offset = L1_DATA_LINESIZE - pref_offset; | |||||
| pref_offset = pref_offset / sizeof(FLOAT); | |||||
| } | |||||
| x_pref = x + pref_offset + 32; | |||||
| da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i); | |||||
| da_i_vec_neg = -da_i_vec; | |||||
| da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec); | |||||
| da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r); | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| PREF_OFFSET(x_pref, 0); | |||||
| PREF_OFFSET(x_pref, 32); | |||||
| PREF_OFFSET(x_pref, 64); | |||||
| PREF_OFFSET(x_pref, 96); | |||||
| PREF_OFFSET(x_pref, 128); | |||||
| PREF_OFFSET(x_pref, 160); | |||||
| PREF_OFFSET(x_pref, 192); | |||||
| PREF_OFFSET(x_pref, 224); | |||||
| x_pref += 32; | |||||
| LD_DP16_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, | |||||
| x11, x12, x13, x14, x15); | |||||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||||
| d0, d1, d2, d3); | |||||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||||
| d4, d5, d6, d7); | |||||
| MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, | |||||
| d8, d9, d10, d11); | |||||
| MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, | |||||
| d12, d13, d14, d15); | |||||
| SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); | |||||
| SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78); | |||||
| SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78); | |||||
| SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78); | |||||
| FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); | |||||
| FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7); | |||||
| FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11); | |||||
| FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15); | |||||
| ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, | |||||
| d12, d13, d14, d15, x, 2); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||||
| d0, d1, d2, d3); | |||||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||||
| d4, d5, d6, d7); | |||||
| SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); | |||||
| SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78); | |||||
| FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); | |||||
| FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7); | |||||
| ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 2); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP4_INC(px, 2, x0, x1, x2, x3); | |||||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||||
| d0, d1, d2, d3); | |||||
| SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); | |||||
| FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); | |||||
| ST_DP4_INC(d0, d1, d2, d3, x, 2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_DP2_INC(px, 2, x0, x1); | |||||
| MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1); | |||||
| SHF_W2_DP(d0, d1, d0, d1, SHF_78); | |||||
| FMADD2(x0, x1, da_r_vec, d0, d1); | |||||
| ST_DP2_INC(d0, d1, x, 2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(px, 1, f0, f1); | |||||
| tp0 = da_r * f0; | |||||
| tp0 -= da_i * f1; | |||||
| tp1 = da_r * f1; | |||||
| tp1 += da_i * f0; | |||||
| ST_GP2_INC(tp0, tp1, x, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| inc_x2 = 2 * inc_x; | |||||
| if ((0.0 == da_r) && (0.0 == da_i)) | |||||
| { | |||||
| v2f64 zero_v = __msa_cast_to_vector_double(0); | |||||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); | |||||
| zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||||
| zero_v, zero_v, x, inc_x2); | |||||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||||
| zero_v, zero_v, x, inc_x2); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, | |||||
| zero_v, zero_v, x, inc_x2); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, inc_x2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| ST_DP2_INC(zero_v, zero_v, x, inc_x2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| ST_DP(zero_v, x); | |||||
| } | |||||
| } | |||||
| } | |||||
| else if (0.0 == da_r) | |||||
| { | |||||
| da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i); | |||||
| da_i_vec_neg = -da_i_vec; | |||||
| da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec); | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, | |||||
| x10, x11, x12, x13, x14, x15); | |||||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||||
| x0, x1, x2, x3); | |||||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||||
| x4, x5, x6, x7); | |||||
| MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, | |||||
| x8, x9, x10, x11); | |||||
| MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, | |||||
| x12, x13, x14, x15); | |||||
| SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); | |||||
| SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78); | |||||
| SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78); | |||||
| SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78); | |||||
| ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, | |||||
| x12, x13, x14, x15, x, inc_x2); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||||
| x0, x1, x2, x3); | |||||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||||
| x4, x5, x6, x7); | |||||
| SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); | |||||
| SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78); | |||||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); | |||||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||||
| x0, x1, x2, x3); | |||||
| SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78); | |||||
| ST_DP4_INC(x0, x1, x2, x3, x, inc_x2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_DP2_INC(px, inc_x2, x0, x1); | |||||
| MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1); | |||||
| SHF_W2_DP(x0, x1, x0, x1, SHF_78); | |||||
| ST_DP2_INC(x0, x1, x, inc_x2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(px, 1, f0, f1); | |||||
| MUL2(f0, da_i, f1, -da_i, f0, f1); | |||||
| ST_GP2_INC(f1, f0, x, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
| else if (0.0 == da_i) | |||||
| { | |||||
| da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r); | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, | |||||
| x10, x11, x12, x13, x14, x15); | |||||
| MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, | |||||
| x0, x1, x2, x3); | |||||
| MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, | |||||
| x4, x5, x6, x7); | |||||
| MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec, | |||||
| x8, x9, x10, x11); | |||||
| MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec, | |||||
| x12, x13, x14, x15); | |||||
| ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, | |||||
| x12, x13, x14, x15, x, inc_x2); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, | |||||
| x0, x1, x2, x3); | |||||
| MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, | |||||
| x4, x5, x6, x7); | |||||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); | |||||
| MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, | |||||
| x0, x1, x2, x3); | |||||
| ST_DP4_INC(x0, x1, x2, x3, x, inc_x2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_DP2_INC(px, inc_x2, x0, x1); | |||||
| MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1); | |||||
| ST_DP2_INC(x0, x1, x, inc_x2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(px, 1, f0, f1); | |||||
| MUL2(f0, da_r, f1, da_r, f0, f1); | |||||
| ST_GP2_INC(f0, f1, x, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i); | |||||
| da_i_vec_neg = -da_i_vec; | |||||
| da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec); | |||||
| da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r); | |||||
| for (i = (n >> 4); i--;) | |||||
| { | |||||
| LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, | |||||
| x10, x11, x12, x13, x14, x15); | |||||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||||
| d0, d1, d2, d3); | |||||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||||
| d4, d5, d6, d7); | |||||
| MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, | |||||
| d8, d9, d10, d11); | |||||
| MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, | |||||
| d12, d13, d14, d15); | |||||
| SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); | |||||
| SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78); | |||||
| SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78); | |||||
| SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78); | |||||
| FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); | |||||
| FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7); | |||||
| FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11); | |||||
| FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15); | |||||
| ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, | |||||
| d12, d13, d14, d15, x, inc_x2); | |||||
| } | |||||
| if (n & 15) | |||||
| { | |||||
| if (n & 8) | |||||
| { | |||||
| LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||||
| d0, d1, d2, d3); | |||||
| MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, | |||||
| d4, d5, d6, d7); | |||||
| SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); | |||||
| SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78); | |||||
| FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); | |||||
| FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7); | |||||
| ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, inc_x2); | |||||
| } | |||||
| if (n & 4) | |||||
| { | |||||
| LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); | |||||
| MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, | |||||
| d0, d1, d2, d3); | |||||
| SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78); | |||||
| FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); | |||||
| ST_DP4_INC(d0, d1, d2, d3, x, inc_x2); | |||||
| } | |||||
| if (n & 2) | |||||
| { | |||||
| LD_DP2_INC(px, inc_x2, x0, x1); | |||||
| MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1); | |||||
| SHF_W2_DP(d0, d1, d0, d1, SHF_78); | |||||
| FMADD2(x0, x1, da_r_vec, d0, d1); | |||||
| ST_DP2_INC(d0, d1, x, inc_x2); | |||||
| } | |||||
| if (n & 1) | |||||
| { | |||||
| LD_GP2_INC(px, 1, f0, f1); | |||||
| tp0 = da_r * f0; | |||||
| tp0 -= da_i * f1; | |||||
| tp1 = da_r * f1; | |||||
| tp1 += da_i * f0; | |||||
| ST_GP2_INC(tp0, tp1, x, 1); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||
| @@ -0,0 +1,238 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #include "common.h" | |||||
| #include "macros_msa.h" | |||||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, | |||||
| FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, | |||||
| BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
| { | |||||
| BLASLONG i, inc_x2, inc_y2, pref_offsetx, pref_offsety; | |||||
| FLOAT *px, *py; | |||||
| v2f64 x0, x1, x2, x3, x4, x5, x6, x7; | |||||
| v2f64 y0, y1, y2, y3, y4, y5, y6, y7; | |||||
| if (n < 0) return (0); | |||||
| pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offsetx > 0) | |||||
| { | |||||
| pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; | |||||
| pref_offsetx = pref_offsetx / sizeof(FLOAT); | |||||
| } | |||||
| pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1); | |||||
| if (pref_offsety > 0) | |||||
| { | |||||
| pref_offsety = L1_DATA_LINESIZE - pref_offsety; | |||||
| pref_offsety = pref_offsety / sizeof(FLOAT); | |||||
| } | |||||
| inc_x2 = 2 * inc_x; | |||||
| inc_y2 = 2 * inc_y; | |||||
| px = srcx; | |||||
| py = srcy; | |||||
| if ((1 == inc_x) && (1 == inc_y)) | |||||
| { | |||||
| if (n >> 3) | |||||
| { | |||||
| LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| for (i = (n >> 3) - 1; i--;) | |||||
| { | |||||
| PREFETCH(px + pref_offsetx + 16); | |||||
| PREFETCH(px + pref_offsetx + 20); | |||||
| PREFETCH(px + pref_offsetx + 24); | |||||
| PREFETCH(px + pref_offsetx + 28); | |||||
| PREFETCH(py + pref_offsety + 16); | |||||
| PREFETCH(py + pref_offsety + 20); | |||||
| PREFETCH(py + pref_offsety + 24); | |||||
| PREFETCH(py + pref_offsety + 28); | |||||
| y0 = LD_DP(py); py += 2; | |||||
| ST_DP(x0, srcy); srcy += 2; | |||||
| y1 = LD_DP(py); py += 2; | |||||
| ST_DP(x1, srcy); srcy += 2; | |||||
| y2 = LD_DP(py); py += 2; | |||||
| ST_DP(x2, srcy); srcy += 2; | |||||
| y3 = LD_DP(py); py += 2; | |||||
| ST_DP(x3, srcy); srcy += 2; | |||||
| y4 = LD_DP(py); py += 2; | |||||
| ST_DP(x4, srcy); srcy += 2; | |||||
| y5 = LD_DP(py); py += 2; | |||||
| ST_DP(x5, srcy); srcy += 2; | |||||
| y6 = LD_DP(py); py += 2; | |||||
| ST_DP(x6, srcy); srcy += 2; | |||||
| y7 = LD_DP(py); py += 2; | |||||
| ST_DP(x7, srcy); srcy += 2; | |||||
| x0 = LD_DP(px); px += 2; | |||||
| ST_DP(y0, srcx); srcx += 2; | |||||
| x1 = LD_DP(px); px += 2; | |||||
| ST_DP(y1, srcx); srcx += 2; | |||||
| x2 = LD_DP(px); px += 2; | |||||
| ST_DP(y2, srcx); srcx += 2; | |||||
| x3 = LD_DP(px); px += 2; | |||||
| ST_DP(y3, srcx); srcx += 2; | |||||
| x4 = LD_DP(px); px += 2; | |||||
| ST_DP(y4, srcx); srcx += 2; | |||||
| x5 = LD_DP(px); px += 2; | |||||
| ST_DP(y5, srcx); srcx += 2; | |||||
| x6 = LD_DP(px); px += 2; | |||||
| ST_DP(y6, srcx); srcx += 2; | |||||
| x7 = LD_DP(px); px += 2; | |||||
| ST_DP(y7, srcx); srcx += 2; | |||||
| } | |||||
| LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); | |||||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, 2); | |||||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, 2); | |||||
| } | |||||
| if (n & 7) | |||||
| { | |||||
| if ((n & 4) && (n & 2) && (n & 1)) | |||||
| { | |||||
| LD_DP7_INC(px, 2, x0, x1, x2, x3, x4, x5, x6); | |||||
| LD_DP7_INC(py, 2, y0, y1, y2, y3, y4, y5, y6); | |||||
| ST_DP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, 2); | |||||
| ST_DP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, 2); | |||||
| } | |||||
| else if ((n & 4) && (n & 2)) | |||||
| { | |||||
| LD_DP6_INC(px, 2, x0, x1, x2, x3, x4, x5); | |||||
| LD_DP6_INC(py, 2, y0, y1, y2, y3, y4, y5); | |||||
| ST_DP6_INC(x0, x1, x2, x3, x4, x5, srcy, 2); | |||||
| ST_DP6_INC(y0, y1, y2, y3, y4, y5, srcx, 2); | |||||
| } | |||||
| else if ((n & 4) && (n & 1)) | |||||
| { | |||||
| LD_DP5_INC(px, 2, x0, x1, x2, x3, x4); | |||||
| LD_DP5_INC(py, 2, y0, y1, y2, y3, y4); | |||||
| ST_DP5_INC(x0, x1, x2, x3, x4, srcy, 2); | |||||
| ST_DP5_INC(y0, y1, y2, y3, y4, srcx, 2); | |||||
| } | |||||
| else if ((n & 2) && (n & 1)) | |||||
| { | |||||
| LD_DP3_INC(px, 2, x0, x1, x2); | |||||
| LD_DP3_INC(py, 2, y0, y1, y2); | |||||
| ST_DP3_INC(x0, x1, x2, srcy, 2); | |||||
| ST_DP3_INC(y0, y1, y2, srcx, 2); | |||||
| } | |||||
| else if (n & 4) | |||||
| { | |||||
| LD_DP4_INC(px, 2, x0, x1, x2, x3); | |||||
| LD_DP4_INC(py, 2, y0, y1, y2, y3); | |||||
| ST_DP4_INC(x0, x1, x2, x3, srcy, 2); | |||||
| ST_DP4_INC(y0, y1, y2, y3, srcx, 2); | |||||
| } | |||||
| else if (n & 2) | |||||
| { | |||||
| LD_DP2_INC(px, 2, x0, x1); | |||||
| LD_DP2_INC(py, 2, y0, y1); | |||||
| ST_DP2_INC(x0, x1, srcy, 2); | |||||
| ST_DP2_INC(y0, y1, srcx, 2); | |||||
| } | |||||
| else if (n & 1) | |||||
| { | |||||
| x0 = LD_DP(px); | |||||
| y0 = LD_DP(py); | |||||
| ST_DP(y0, srcx); | |||||
| ST_DP(x0, srcy); | |||||
| } | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| for (i = (n >> 3); i--;) | |||||
| { | |||||
| LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); | |||||
| LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7); | |||||
| ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y2); | |||||
| ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x2); | |||||
| } | |||||
| if (n & 7) | |||||
| { | |||||
| if ((n & 4) && (n & 2) && (n & 1)) | |||||
| { | |||||
| LD_DP7_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6); | |||||
| LD_DP7_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6); | |||||
| ST_DP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y2); | |||||
| ST_DP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x2); | |||||
| } | |||||
| else if ((n & 4) && (n & 2)) | |||||
| { | |||||
| LD_DP6_INC(px, inc_x2, x0, x1, x2, x3, x4, x5); | |||||
| LD_DP6_INC(py, inc_y2, y0, y1, y2, y3, y4, y5); | |||||
| ST_DP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y2); | |||||
| ST_DP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x2); | |||||
| } | |||||
| else if ((n & 4) && (n & 1)) | |||||
| { | |||||
| LD_DP5_INC(px, inc_x2, x0, x1, x2, x3, x4); | |||||
| LD_DP5_INC(py, inc_y2, y0, y1, y2, y3, y4); | |||||
| ST_DP5_INC(x0, x1, x2, x3, x4, srcy, inc_y2); | |||||
| ST_DP5_INC(y0, y1, y2, y3, y4, srcx, inc_x2); | |||||
| } | |||||
| else if ((n & 2) && (n & 1)) | |||||
| { | |||||
| LD_DP3_INC(px, inc_x2, x0, x1, x2); | |||||
| LD_DP3_INC(py, inc_y2, y0, y1, y2); | |||||
| ST_DP3_INC(x0, x1, x2, srcy, inc_y2); | |||||
| ST_DP3_INC(y0, y1, y2, srcx, inc_x2); | |||||
| } | |||||
| else if (n & 4) | |||||
| { | |||||
| LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); | |||||
| LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); | |||||
| ST_DP4_INC(x0, x1, x2, x3, srcy, inc_y2); | |||||
| ST_DP4_INC(y0, y1, y2, y3, srcx, inc_x2); | |||||
| } | |||||
| else if (n & 2) | |||||
| { | |||||
| LD_DP2_INC(px, inc_x2, x0, x1); | |||||
| LD_DP2_INC(py, inc_y2, y0, y1); | |||||
| ST_DP2_INC(x0, x1, srcy, inc_y2); | |||||
| ST_DP2_INC(y0, y1, srcx, inc_x2); | |||||
| } | |||||
| else if (n & 1) | |||||
| { | |||||
| x0 = LD_DP(px); | |||||
| y0 = LD_DP(py); | |||||
| ST_DP(y0, srcx); | |||||
| ST_DP(x0, srcy); | |||||
| } | |||||
| } | |||||
| } | |||||
| return (0); | |||||
| } | |||||