Browse Source

Merge pull request #1055 from ksraste/develop

Add msa optimization for AXPY, COPY, SCALE, SWAP
tags/v0.2.20^2
Zhang Xianyi GitHub 9 years ago
parent
commit
002b41f024
18 changed files with 5835 additions and 0 deletions
  1. +28
    -0
      kernel/mips/KERNEL.P5600
  2. +471
    -0
      kernel/mips/caxpy_msa.c
  3. +201
    -0
      kernel/mips/ccopy_msa.c
  4. +1012
    -0
      kernel/mips/cscal_msa.c
  5. +281
    -0
      kernel/mips/cswap_msa.c
  6. +246
    -0
      kernel/mips/daxpy_msa.c
  7. +180
    -0
      kernel/mips/dcopy_msa.c
  8. +368
    -0
      kernel/mips/dscal_msa.c
  9. +253
    -0
      kernel/mips/dswap_msa.c
  10. +25
    -0
      kernel/mips/macros_msa.h
  11. +265
    -0
      kernel/mips/saxpy_msa.c
  12. +186
    -0
      kernel/mips/scopy_msa.c
  13. +385
    -0
      kernel/mips/sscal_msa.c
  14. +267
    -0
      kernel/mips/sswap_msa.c
  15. +494
    -0
      kernel/mips/zaxpy_msa.c
  16. +218
    -0
      kernel/mips/zcopy_msa.c
  17. +717
    -0
      kernel/mips/zscal_msa.c
  18. +238
    -0
      kernel/mips/zswap_msa.c

+ 28
- 0
kernel/mips/KERNEL.P5600 View File

@@ -42,15 +42,29 @@ CASUMKERNEL = ../mips/asum.c
ZASUMKERNEL = ../mips/asum.c
endif

ifdef HAVE_MSA
SAXPYKERNEL = ../mips/saxpy_msa.c
DAXPYKERNEL = ../mips/daxpy_msa.c
CAXPYKERNEL = ../mips/caxpy_msa.c
ZAXPYKERNEL = ../mips/zaxpy_msa.c
else
SAXPYKERNEL = ../mips/axpy.c
DAXPYKERNEL = ../mips/axpy.c
CAXPYKERNEL = ../mips/zaxpy.c
ZAXPYKERNEL = ../mips/zaxpy.c
endif

ifdef HAVE_MSA
SCOPYKERNEL = ../mips/scopy_msa.c
DCOPYKERNEL = ../mips/dcopy_msa.c
CCOPYKERNEL = ../mips/ccopy_msa.c
ZCOPYKERNEL = ../mips/zcopy_msa.c
else
SCOPYKERNEL = ../mips/copy.c
DCOPYKERNEL = ../mips/copy.c
CCOPYKERNEL = ../mips/zcopy.c
ZCOPYKERNEL = ../mips/zcopy.c
endif

ifdef HAVE_MSA
SDOTKERNEL = ../mips/sdot_msa.c
@@ -74,15 +88,29 @@ DROTKERNEL = ../mips/rot.c
CROTKERNEL = ../mips/zrot.c
ZROTKERNEL = ../mips/zrot.c

ifdef HAVE_MSA
SSCALKERNEL = ../mips/sscal_msa.c
DSCALKERNEL = ../mips/dscal_msa.c
CSCALKERNEL = ../mips/cscal_msa.c
ZSCALKERNEL = ../mips/zscal_msa.c
else
SSCALKERNEL = ../mips/scal.c
DSCALKERNEL = ../mips/scal.c
CSCALKERNEL = ../mips/zscal.c
ZSCALKERNEL = ../mips/zscal.c
endif

ifdef HAVE_MSA
SSWAPKERNEL = ../mips/sswap_msa.c
DSWAPKERNEL = ../mips/dswap_msa.c
CSWAPKERNEL = ../mips/cswap_msa.c
ZSWAPKERNEL = ../mips/zswap_msa.c
else
SSWAPKERNEL = ../mips/swap.c
DSWAPKERNEL = ../mips/swap.c
CSWAPKERNEL = ../mips/zswap.c
ZSWAPKERNEL = ../mips/zswap.c
endif

ifdef HAVE_MSA
SGEMVNKERNEL = ../mips/sgemv_n_msa.c


+ 471
- 0
kernel/mips/caxpy_msa.c View File

@@ -0,0 +1,471 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

#if !defined(CONJ)
#define OP0 +=
#define OP1 -=
#define OP2 +=
#else
#define OP0 -=
#define OP1 +=
#define OP2 -=
#endif

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2)
{
BLASLONG i, inc_x2, inc_y2;
FLOAT *py;
v4f32 x0, x1, x2, x3, x4, x5, x6, x7;
v4f32 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec;
v4f32 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i;
v4f32 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i;
FLOAT xd0, xd1, xd2, xd3, xd4, xd5, xd6, xd7;
FLOAT yd0, yd1, yd2, yd3, yd4, yd5, yd6, yd7;

if (n < 0) return(0);
if ((da_r == 0.0) && (da_i == 0.0)) return(0);

py = y;

if ((1 == inc_x) && (1 == inc_y))
{
FLOAT *x_pref, *y_pref;
BLASLONG pref_offset;

pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 64;

pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
y_pref = y + pref_offset + 64;

dar_vec = COPY_FLOAT_TO_VECTOR(da_r);
dai_vec = COPY_FLOAT_TO_VECTOR(da_i);

for (i = (n >> 4); i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(y_pref, 0);
PREF_OFFSET(y_pref, 32);
PREF_OFFSET(y_pref, 64);
PREF_OFFSET(y_pref, 96);
x_pref += 32;
y_pref += 32;

LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
PCKEVOD_W2_SP(x3, x2, x1r, x1i);
PCKEVOD_W2_SP(y3, y2, y1r, y1i);
PCKEVOD_W2_SP(x5, x4, x2r, x2i);
PCKEVOD_W2_SP(y5, y4, y2r, y2i);
PCKEVOD_W2_SP(x7, x6, x3r, x3i);
PCKEVOD_W2_SP(y7, y6, y3r, y3i);

FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y2i OP0 dar_vec * x2i;
y3i OP0 dar_vec * x3i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y2r OP1 dai_vec * x2i;
y3r OP1 dai_vec * x3i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
y2i OP2 dai_vec * x2r;
y3i OP2 dai_vec * x3r;

ILVRL_W2_SP(y0i, y0r, y0, y1);
ILVRL_W2_SP(y1i, y1r, y2, y3);
ILVRL_W2_SP(y2i, y2r, y4, y5);
ILVRL_W2_SP(y3i, y3r, y6, y7);
ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
}

if (n & 15)
{
if (n & 8)
{
LD_SP4_INC(x, 4, x0, x1, x2, x3);
LD_SP4_INC(py, 4, y0, y1, y2, y3);
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
PCKEVOD_W2_SP(x3, x2, x1r, x1i);
PCKEVOD_W2_SP(y3, y2, y1r, y1i);

FMADD2(x0r, x1r, dar_vec, y0r, y1r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;

ILVRL_W2_SP(y0i, y0r, y0, y1);
ILVRL_W2_SP(y1i, y1r, y2, y3);
ST_SP4_INC(y0, y1, y2, y3, y, 4);
}

if (n & 4)
{
LD_SP2_INC(x, 4, x0, x1);
LD_SP2_INC(py, 4, y0, y1);
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
PCKEVOD_W2_SP(y1, y0, y0r, y0i);

y0r += dar_vec * x0r;
y0i OP0 dar_vec * x0i;
y0r OP1 dai_vec * x0i;
y0i OP2 dai_vec * x0r;

ILVRL_W2_SP(y0i, y0r, y0, y1);
ST_SP2_INC(y0, y1, y, 4);
}

if (n & 2)
{
LD_GP4_INC(x, 1, xd0, xd1, xd2, xd3);
LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3);

FMADD2(xd0, xd2, da_r, yd0, yd2);
yd1 OP0 da_r * xd1;
yd3 OP0 da_r * xd3;
yd0 OP1 da_i * xd1;
yd2 OP1 da_i * xd3;
yd1 OP2 da_i * xd0;
yd3 OP2 da_i * xd2;

ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1);
}

if (n & 1)
{
LD_GP2_INC(x, 1, xd0, xd1);
LD_GP2_INC(py, 1, yd0, yd1);

yd0 += da_r * xd0;
yd1 OP0 da_r * xd1;
yd0 OP1 da_i * xd1;
yd1 OP2 da_i * xd0;

ST_GP2_INC(yd0, yd1, y, 1);
}
}
}
else if (1 == inc_y)
{
FLOAT *y_pref;
BLASLONG pref_offset;
v4f32 x8, x9, x10, x11, x12, x13, x14;

pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
y_pref = y + pref_offset + 64;

inc_x2 = 2 * inc_x;

dar_vec = COPY_FLOAT_TO_VECTOR(da_r);
dai_vec = COPY_FLOAT_TO_VECTOR(da_i);

for (i = (n >> 4); i--;)
{
PREF_OFFSET(y_pref, 0);
PREF_OFFSET(y_pref, 32);
PREF_OFFSET(y_pref, 64);
PREF_OFFSET(y_pref, 96);
y_pref += 32;

LD_SP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x14);
LD_SP7_INC(x, inc_x2, x8, x9, x10, x11, x12, x13, x7);
PCKEV_D2_SP(x1, x0, x3, x2, x0, x1);
PCKEV_D2_SP(x5, x4, x14, x6, x2, x3);
PCKEV_D2_SP(x9, x8, x11, x10, x4, x5);
x6 = (v4f32) __msa_pckev_d((v2i64) x13, (v2i64) x12);
x7 = (v4f32) __msa_insert_w((v4i32) x7, 2, *((int *) x));
x7 = (v4f32) __msa_insert_w((v4i32) x7, 3, *((int *) (x + 1)));
x += inc_x2;

LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
PCKEVOD_W2_SP(x3, x2, x1r, x1i);
PCKEVOD_W2_SP(y3, y2, y1r, y1i);
PCKEVOD_W2_SP(x5, x4, x2r, x2i);
PCKEVOD_W2_SP(y5, y4, y2r, y2i);
PCKEVOD_W2_SP(x7, x6, x3r, x3i);
PCKEVOD_W2_SP(y7, y6, y3r, y3i);

FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y2i OP0 dar_vec * x2i;
y3i OP0 dar_vec * x3i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y2r OP1 dai_vec * x2i;
y3r OP1 dai_vec * x3i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
y2i OP2 dai_vec * x2r;
y3i OP2 dai_vec * x3r;

ILVRL_W2_SP(y0i, y0r, y0, y1);
ILVRL_W2_SP(y1i, y1r, y2, y3);
ILVRL_W2_SP(y2i, y2r, y4, y5);
ILVRL_W2_SP(y3i, y3r, y6, y7);

ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
}

if (n & 15)
{
if (n & 8)
{
LD_SP7_INC(x, inc_x2, x0, x1, x2, x6, x4, x5, x3);
PCKEV_D2_SP(x1, x0, x6, x2, x0, x1);

x2 = (v4f32) __msa_pckev_d((v2i64) x5, (v2i64) x4);
x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x));
x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) (x + 1)));
x += inc_x2;

LD_SP4_INC(py, 4, y0, y1, y2, y3);
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
PCKEVOD_W2_SP(x3, x2, x1r, x1i);
PCKEVOD_W2_SP(y3, y2, y1r, y1i);

FMADD2(x0r, x1r, dar_vec, y0r, y1r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;

ILVRL_W2_SP(y0i, y0r, y0, y1);
ILVRL_W2_SP(y1i, y1r, y2, y3);
ST_SP4_INC(y0, y1, y2, y3, y, 4);
}

if (n & 4)
{
LD_SP3_INC(x, inc_x2, x0, x2, x1);

x0 = (v4f32) __msa_pckev_d((v2i64) x2, (v2i64) x0);
x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x));
x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) (x + 1)));
x += inc_x2;

LD_SP2_INC(py, 4, y0, y1);
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
PCKEVOD_W2_SP(y1, y0, y0r, y0i);

y0r += dar_vec * x0r;
y0i OP0 dar_vec * x0i;
y0r OP1 dai_vec * x0i;
y0i OP2 dai_vec * x0r;

ILVRL_W2_SP(y0i, y0r, y0, y1);
ST_SP2_INC(y0, y1, y, 4);
}

if (n & 2)
{
xd0 = x[0];
xd1 = x[1];
x += inc_x2;
xd2 = x[0];
xd3 = x[1];
x += inc_x2;

LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3);

FMADD2(xd0, xd2, da_r, yd0, yd2);
yd1 OP0 da_r * xd1;
yd3 OP0 da_r * xd3;
yd0 OP1 da_i * xd1;
yd2 OP1 da_i * xd3;
yd1 OP2 da_i * xd0;
yd3 OP2 da_i * xd2;

ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1);
}

if (n & 1)
{
LD_GP2_INC(x, 1, xd0, xd1);
LD_GP2_INC(py, 1, yd0, yd1);

yd0 += da_r * xd0;
yd1 OP0 da_r * xd1;
yd0 OP1 da_i * xd1;
yd1 OP2 da_i * xd0;

ST_GP2_INC(yd0, yd1, y, 1);
}
}
}
else
{
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;

for (i = (n >> 2); i--;)
{
xd0 = x[0];
xd1 = x[1];
x += inc_x2;
xd2 = x[0];
xd3 = x[1];
x += inc_x2;
xd4 = x[0];
xd5 = x[1];
x += inc_x2;
xd6 = x[0];
xd7 = x[1];
x += inc_x2;

yd0 = py[0];
yd1 = py[1];
py += inc_y2;
yd2 = py[0];
yd3 = py[1];
py += inc_y2;
yd4 = py[0];
yd5 = py[1];
py += inc_y2;
yd6 = py[0];
yd7 = py[1];
py += inc_y2;

FMADD4(xd0, xd2, xd4, xd6, da_r, yd0, yd2, yd4, yd6);
yd1 OP0 da_r * xd1;
yd3 OP0 da_r * xd3;
yd5 OP0 da_r * xd5;
yd7 OP0 da_r * xd7;
yd0 OP1 da_i * xd1;
yd2 OP1 da_i * xd3;
yd4 OP1 da_i * xd5;
yd6 OP1 da_i * xd7;
yd1 OP2 da_i * xd0;
yd3 OP2 da_i * xd2;
yd5 OP2 da_i * xd4;
yd7 OP2 da_i * xd6;

y[0] = yd0;
y[1] = yd1;
y += inc_y2;
y[0] = yd2;
y[1] = yd3;
y += inc_y2;
y[0] = yd4;
y[1] = yd5;
y += inc_y2;
y[0] = yd6;
y[1] = yd7;
y += inc_y2;
}

if (n & 3)
{
if (n & 2)
{
xd0 = x[0];
xd1 = x[1];
x += inc_x2;
xd2 = x[0];
xd3 = x[1];
x += inc_x2;

yd0 = py[0];
yd1 = py[1];
py += inc_y2;
yd2 = py[0];
yd3 = py[1];
py += inc_y2;

FMADD2(xd0, xd2, da_r, yd0, yd2);
yd1 OP0 da_r * xd1;
yd3 OP0 da_r * xd3;
yd0 OP1 da_i * xd1;
yd2 OP1 da_i * xd3;
yd1 OP2 da_i * xd0;
yd3 OP2 da_i * xd2;

y[0] = yd0;
y[1] = yd1;
y += inc_y2;
y[0] = yd2;
y[1] = yd3;
y += inc_y2;
}

if (n & 1)
{
xd0 = x[0];
xd1 = x[1];

yd0 = y[0];
yd1 = y[1];

yd0 += da_r * xd0;
yd1 OP0 da_r * xd1;
yd0 OP1 da_i * xd1;
yd1 OP2 da_i * xd0;

y[0] = yd0;
y[1] = yd1;
}
}
}

return (0);
}

+ 201
- 0
kernel/mips/ccopy_msa.c View File

@@ -0,0 +1,201 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i, inc_x2, inc_y2;
v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
FLOAT f0, f1, f2, f3, f4, f5, f6, f7;

if (n < 0) return (0);

if ((1 == inc_x) && (1 == inc_y))
{
if (n > 31)
{
FLOAT *x_pref;
BLASLONG pref_offset;

pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 128 + 32;

LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
for (i = (n >> 5) - 1; i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(x_pref, 128);
PREF_OFFSET(x_pref, 160);
PREF_OFFSET(x_pref, 192);
PREF_OFFSET(x_pref, 224);
x_pref += 64;

x8 = LD_SP(x); x += 4;
ST_SP(x0, y); y += 4;
x9 = LD_SP(x); x += 4;
ST_SP(x1, y); y += 4;
x10 = LD_SP(x); x += 4;
ST_SP(x2, y); y += 4;
x11 = LD_SP(x); x += 4;
ST_SP(x3, y); y += 4;
x12 = LD_SP(x); x += 4;
ST_SP(x4, y); y += 4;
x13 = LD_SP(x); x += 4;
ST_SP(x5, y); y += 4;
x14 = LD_SP(x); x += 4;
ST_SP(x6, y); y += 4;
x15 = LD_SP(x); x += 4;
ST_SP(x7, y); y += 4;
x0 = LD_SP(x); x += 4;
ST_SP(x8, y); y += 4;
x1 = LD_SP(x); x += 4;
ST_SP(x9, y); y += 4;
x2 = LD_SP(x); x += 4;
ST_SP(x10, y); y += 4;
x3 = LD_SP(x); x += 4;
ST_SP(x11, y); y += 4;
x4 = LD_SP(x); x += 4;
ST_SP(x12, y); y += 4;
x5 = LD_SP(x); x += 4;
ST_SP(x13, y); y += 4;
x6 = LD_SP(x); x += 4;
ST_SP(x14, y); y += 4;
x7 = LD_SP(x); x += 4;
ST_SP(x15, y); y += 4;
}

x8 = LD_SP(x); x += 4;
x9 = LD_SP(x); x += 4;
ST_SP(x0, y); y += 4;
x10 = LD_SP(x); x += 4;
ST_SP(x1, y); y += 4;
x11 = LD_SP(x); x += 4;
ST_SP(x2, y); y += 4;
x12 = LD_SP(x); x += 4;
ST_SP(x3, y); y += 4;
x13 = LD_SP(x); x += 4;
ST_SP(x4, y); y += 4;
x14 = LD_SP(x); x += 4;
ST_SP(x5, y); y += 4;
x15 = LD_SP(x); x += 4;
ST_SP(x6, y); y += 4;
ST_SP(x7, y); y += 4;

ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4);
}

if (n & 31)
{
if (n & 16)
{
LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4);
}

if (n & 8)
{
LD_SP4_INC(x, 4, x0, x1, x2, x3);
ST_SP4_INC(x0, x1, x2, x3, y, 4);
}

if (n & 4)
{
LD_SP2_INC(x, 4, x0, x1);
ST_SP2_INC(x0, x1, y, 4);
}

if (n & 2)
{
LD_GP4_INC(x, 1, f0, f1, f2, f3);
ST_GP4_INC(f0, f1, f2, f3, y, 1);
}

if (n & 1)
{
LD_GP2_INC(x, 1, f0, f1);
ST_GP2_INC(f0, f1, y, 1);
}
}
}
else
{
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;

for (i = (n >> 2); i--;)
{
f0 = *x;
f1 = *(x+1); x += inc_x2;
f2 = *x;
f3 = *(x+1); x += inc_x2;
f4 = *x;
f5 = *(x+1); x += inc_x2;
f6 = *x;
f7 = *(x+1); x += inc_x2;

*y = f0;
*(y+1) = f1; y += inc_y2;
*y = f2;
*(y+1) = f3; y += inc_y2;
*y = f4;
*(y+1) = f5; y += inc_y2;
*y = f6;
*(y+1) = f7; y += inc_y2;
}

if (n & 2)
{
f0 = *x;
f1 = *(x+1); x += inc_x2;
f2 = *x;
f3 = *(x+1); x += inc_x2;

*y = f0;
*(y+1) = f1; y += inc_y2;
*y = f2;
*(y+1) = f3; y += inc_y2;
}

if (n & 1)
{
LD_GP2_INC(x, 1, f0, f1);
ST_GP2_INC(f0, f1, y, 1);
}
}

return (0);
}

+ 1012
- 0
kernel/mips/cscal_msa.c
File diff suppressed because it is too large
View File


+ 281
- 0
kernel/mips/cswap_msa.c View File

@@ -0,0 +1,281 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy,
BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i = 0, pref_offsetx, pref_offsety;
FLOAT *px, *py;
BLASLONG inc_x2, inc_y2;
FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;

if (n < 0) return (0);

pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
if (pref_offsetx > 0)
{
pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
pref_offsetx = pref_offsetx / sizeof(FLOAT);
}

pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
if (pref_offsety > 0)
{
pref_offsety = L1_DATA_LINESIZE - pref_offsety;
pref_offsety = pref_offsety / sizeof(FLOAT);
}

px = srcx;
py = srcy;

inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;

if ((1 == inc_x) && (1 == inc_y))
{
if (n >> 4)
{
LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);

for (i = (n >> 4) - 1; i--;)
{
PREFETCH(px + pref_offsetx + 32);
PREFETCH(px + pref_offsetx + 40);
PREFETCH(px + pref_offsetx + 48);
PREFETCH(px + pref_offsetx + 56);

PREFETCH(py + pref_offsety + 32);
PREFETCH(py + pref_offsety + 40);
PREFETCH(py + pref_offsety + 48);
PREFETCH(py + pref_offsety + 56);

yv0 = LD_SP(py); py += 4;
ST_SP(xv0, srcy); srcy += 4;
yv1 = LD_SP(py); py += 4;
ST_SP(xv1, srcy); srcy += 4;
yv2 = LD_SP(py); py += 4;
ST_SP(xv2, srcy); srcy += 4;
yv3 = LD_SP(py); py += 4;
ST_SP(xv3, srcy); srcy += 4;
yv4 = LD_SP(py); py += 4;
ST_SP(xv4, srcy); srcy += 4;
yv5 = LD_SP(py); py += 4;
ST_SP(xv5, srcy); srcy += 4;
yv6 = LD_SP(py); py += 4;
ST_SP(xv6, srcy); srcy += 4;
yv7 = LD_SP(py); py += 4;
ST_SP(xv7, srcy); srcy += 4;

xv0 = LD_SP(px); px += 4;
ST_SP(yv0, srcx); srcx += 4;
xv1 = LD_SP(px); px += 4;
ST_SP(yv1, srcx); srcx += 4;
xv2 = LD_SP(px); px += 4;
ST_SP(yv2, srcx); srcx += 4;
xv3 = LD_SP(px); px += 4;
ST_SP(yv3, srcx); srcx += 4;
xv4 = LD_SP(px); px += 4;
ST_SP(yv4, srcx); srcx += 4;
xv5 = LD_SP(px); px += 4;
ST_SP(yv5, srcx); srcx += 4;
xv6 = LD_SP(px); px += 4;
ST_SP(yv6, srcx); srcx += 4;
xv7 = LD_SP(px); px += 4;
ST_SP(yv7, srcx); srcx += 4;
}

LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4);
ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4);
}

if (n & 15)
{
if ((n & 8) && (n & 4) && (n & 2))
{
LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4);
ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4);
}
else if ((n & 8) && (n & 4))
{
LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5);
LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5);
ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4);
ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4);
}
else if ((n & 8) && (n & 2))
{
LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4);
LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4);
ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4);
ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4);
}
else if ((n & 4) && (n & 2))
{
LD_SP3_INC(px, 4, xv0, xv1, xv2);
LD_SP3_INC(py, 4, yv0, yv1, yv2);
ST_SP3_INC(xv0, xv1, xv2, srcy, 4);
ST_SP3_INC(yv0, yv1, yv2, srcx, 4);
}
else if (n & 8)
{
LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3);
LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3);
ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4);
ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4);
}
else if (n & 4)
{
LD_SP2_INC(px, 4, xv0, xv1);
LD_SP2_INC(py, 4, yv0, yv1);
ST_SP2_INC(xv0, xv1, srcy, 4);
ST_SP2_INC(yv0, yv1, srcx, 4);
}
else if (n & 2)
{
xv0 = LD_SP(px);
yv0 = LD_SP(py);

px += 4;
py += 4;

ST_SP(xv0, srcy);
ST_SP(yv0, srcx);

srcx += 4;
srcy += 4;
}

if (n & 1)
{
LD_GP2_INC(px, 1, x0, x1);
LD_GP2_INC(py, 1, y0, y1);
ST_GP2_INC(x0, x1, srcy, 1);
ST_GP2_INC(y0, y1, srcx, 1);
}
}
}
else
{
for (i = (n >> 2); i--;)
{
x0 = srcx[0 * inc_x2];
x1 = srcx[0 * inc_x2 + 1];
x2 = srcx[1 * inc_x2];
x3 = srcx[1 * inc_x2 + 1];
x4 = srcx[2 * inc_x2];
x5 = srcx[2 * inc_x2 + 1];
x6 = srcx[3 * inc_x2];
x7 = srcx[3 * inc_x2 + 1];

y0 = srcy[0 * inc_y2];
y1 = srcy[0 * inc_y2 + 1];
y2 = srcy[1 * inc_y2];
y3 = srcy[1 * inc_y2 + 1];
y4 = srcy[2 * inc_y2];
y5 = srcy[2 * inc_y2 + 1];
y6 = srcy[3 * inc_y2];
y7 = srcy[3 * inc_y2 + 1];

srcx[0 * inc_x2] = y0;
srcx[0 * inc_x2 + 1] = y1;
srcx[1 * inc_x2] = y2;
srcx[1 * inc_x2 + 1] = y3;
srcx[2 * inc_x2] = y4;
srcx[2 * inc_x2 + 1] = y5;
srcx[3 * inc_x2] = y6;
srcx[3 * inc_x2 + 1] = y7;

srcy[0 * inc_y2] = x0;
srcy[0 * inc_y2 + 1] = x1;
srcy[1 * inc_y2] = x2;
srcy[1 * inc_y2 + 1] = x3;
srcy[2 * inc_y2] = x4;
srcy[2 * inc_y2 + 1] = x5;
srcy[3 * inc_y2] = x6;
srcy[3 * inc_y2 + 1] = x7;

srcx += 4 * inc_x2;
srcy += 4 * inc_y2;
}

if (n & 2)
{
x0 = srcx[0 * inc_x2];
x1 = srcx[0 * inc_x2 + 1];
x2 = srcx[1 * inc_x2];
x3 = srcx[1 * inc_x2 + 1];

y0 = srcy[0 * inc_y2];
y1 = srcy[0 * inc_y2 + 1];
y2 = srcy[1 * inc_y2];
y3 = srcy[1 * inc_y2 + 1];

srcx[0 * inc_x2] = y0;
srcx[0 * inc_x2 + 1] = y1;
srcx[1 * inc_x2] = y2;
srcx[1 * inc_x2 + 1] = y3;

srcy[0 * inc_y2] = x0;
srcy[0 * inc_y2 + 1] = x1;
srcy[1 * inc_y2] = x2;
srcy[1 * inc_y2 + 1] = x3;

srcx += 2 * inc_x2;
srcy += 2 * inc_y2;
}

if (n & 1)
{
x0 = srcx[0 * inc_x2];
x1 = srcx[0 * inc_x2 + 1];

y0 = srcy[0 * inc_y2];
y1 = srcy[0 * inc_y2 + 1];

srcx[0 * inc_x2] = y0;
srcx[0 * inc_x2 + 1] = y1;

srcy[0 * inc_y2] = x0;
srcy[0 * inc_y2 + 1] = x1;

srcx += inc_x2;
srcy += inc_y2;
}
}

return (0);
}

+ 246
- 0
kernel/mips/daxpy_msa.c View File

@@ -0,0 +1,246 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

#if !defined(CONJ)
#define OP0 +=
#define OP1 -=
#define OP2 +=
#else
#define OP0 -=
#define OP1 +=
#define OP2 -=
#endif

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2)
{
BLASLONG i;
FLOAT *py;
v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
v2f64 da_vec, zero_v = {0};

if ((n < 0) || (da == 0.0)) return(0);

py = y;

if ((1 == inc_x) && (1 == inc_y))
{
FLOAT *x_pref, *y_pref;
BLASLONG pref_offset;

pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 32;

pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
y_pref = y + pref_offset + 32;

da_vec = COPY_DOUBLE_TO_VECTOR(da);

for (i = (n >> 4); i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(y_pref, 0);
PREF_OFFSET(y_pref, 32);
PREF_OFFSET(y_pref, 64);
PREF_OFFSET(y_pref, 96);
x_pref += 16;
y_pref += 16;

LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7);
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
}

if (n & 15)
{
if (n & 8)
{
LD_DP4_INC(x, 2, x0, x1, x2, x3);
LD_DP4_INC(py, 2, y0, y1, y2, y3);
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
ST_DP4_INC(y0, y1, y2, y3, y, 2);
}

if (n & 4)
{
LD_DP2_INC(x, 2, x0, x1);
LD_DP2_INC(py, 2, y0, y1);
FMADD2(x0, x1, da_vec, y0, y1);
ST_DP2_INC(y0, y1, y, 2);
}

if (n & 2)
{
x0 = LD_DP(x); x += 2;
y0 = LD_DP(py); py += 2;
y0 += da_vec * x0;
ST_DP(y0, y); y += 2;
}

if (n & 1)
{
y[0] += da * x[0];
}
}
}
else if (1 == inc_y)
{
FLOAT *y_pref;
BLASLONG pref_offset;
v2f64 x8, x9, x10, x11, x12, x13, x14;

pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
y_pref = y + pref_offset + 32;

da_vec = COPY_DOUBLE_TO_VECTOR(da);

for (i = (n >> 4); i--;)
{
PREF_OFFSET(y_pref, 0);
PREF_OFFSET(y_pref, 32);
PREF_OFFSET(y_pref, 64);
PREF_OFFSET(y_pref, 96);
y_pref += 16;

LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x14);
LD_DP7_INC(x, inc_x, x8, x9, x10, x11, x12, x13, x7);

PCKEV_D2_SD(x1, x0, x3, x2, x0, x1);
PCKEV_D2_SD(x5, x4, x14, x6, x2, x3);
PCKEV_D2_SD(x9, x8, x11, x10, x4, x5);
x6 = (v2f64) __msa_pckev_d((v2i64) x13, (v2i64) x12);
x7 = (v2f64) __msa_insert_d((v2i64) x7, 1, *((BLASLONG *) x));
x += inc_x;

LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7);
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
}

if (n & 15)
{
if (n & 8)
{
LD_DP7_INC(x, inc_x, x0, x1, x2, x6, x4, x5, x3);

PCKEV_D2_SD(x1, x0, x6, x2, x0, x1);
x2 = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4);
x3 = (v2f64) __msa_insert_d((v2i64) x3, 1, *((BLASLONG *) x));
x += inc_x;

LD_DP4_INC(py, 2, y0, y1, y2, y3);
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
ST_DP4_INC(y0, y1, y2, y3, y, 2);
}

if (n & 4)
{
LD_DP3_INC(x, inc_x, x0, x2, x1);

x0 = (v2f64) __msa_pckev_d((v2i64) x2, (v2i64) x0);
x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((BLASLONG *) x));
x += inc_x;

LD_DP2_INC(py, 2, y0, y1);
FMADD2(x0, x1, da_vec, y0, y1);
ST_DP2_INC(y0, y1, y, 2);
}

if (n & 2)
{
x0 = (v2f64) __msa_insert_d((v2i64) zero_v, 0, *((BLASLONG *) x));
x += inc_x;
x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((BLASLONG *) x));
x += inc_x;

y0 = LD_DP(py); py += 2;
y0 += da_vec * x0;
ST_DP(y0, y); y += 2;
}

if (n & 1)
{
y[0] += da * x[0];
}
}
}
else
{
FLOAT x0, x1, x2, x3, y0, y1, y2, y3;

for (i = (n >> 2); i--;)
{
LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3);
ST_GP4_INC(y0, y1, y2, y3, y, inc_y);
}

if (n & 3)
{
if (n & 2)
{
LD_GP2_INC(x, inc_x, x0, x1);
LD_GP2_INC(py, inc_y, y0, y1);
FMADD2(x0, x1, da, y0, y1);
ST_GP2_INC(y0, y1, y, inc_y);
}

if (n & 1)
{
*y += da * *x;
}
}
}

return (0);
}

+ 180
- 0
kernel/mips/dcopy_msa.c View File

@@ -0,0 +1,180 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i;
v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
FLOAT f0, f1, f2, f3, f4, f5, f6, f7;

if (n < 0) return (0);

if ((1 == inc_x) && (1 == inc_y))
{
if (n > 31)
{
FLOAT *x_pref;
BLASLONG pref_offset;

pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 64 + 16;

LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
for (i = (n >> 5) - 1; i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(x_pref, 128);
PREF_OFFSET(x_pref, 160);
PREF_OFFSET(x_pref, 192);
PREF_OFFSET(x_pref, 224);
x_pref += 32;

x8 = LD_DP(x); x += 2;
ST_DP(x0, y); y += 2;
x9 = LD_DP(x); x += 2;
ST_DP(x1, y); y += 2;
x10 = LD_DP(x); x += 2;
ST_DP(x2, y); y += 2;
x11 = LD_DP(x); x += 2;
ST_DP(x3, y); y += 2;
x12 = LD_DP(x); x += 2;
ST_DP(x4, y); y += 2;
x13 = LD_DP(x); x += 2;
ST_DP(x5, y); y += 2;
x14 = LD_DP(x); x += 2;
ST_DP(x6, y); y += 2;
x15 = LD_DP(x); x += 2;
ST_DP(x7, y); y += 2;
x0 = LD_DP(x); x += 2;
ST_DP(x8, y); y += 2;
x1 = LD_DP(x); x += 2;
ST_DP(x9, y); y += 2;
x2 = LD_DP(x); x += 2;
ST_DP(x10, y); y += 2;
x3 = LD_DP(x); x += 2;
ST_DP(x11, y); y += 2;
x4 = LD_DP(x); x += 2;
ST_DP(x12, y); y += 2;
x5 = LD_DP(x); x += 2;
ST_DP(x13, y); y += 2;
x6 = LD_DP(x); x += 2;
ST_DP(x14, y); y += 2;
x7 = LD_DP(x); x += 2;
ST_DP(x15, y); y += 2;
}

x8 = LD_DP(x); x += 2;
x9 = LD_DP(x); x += 2;
ST_DP(x0, y); y += 2;
x10 = LD_DP(x); x += 2;
ST_DP(x1, y); y += 2;
x11 = LD_DP(x); x += 2;
ST_DP(x2, y); y += 2;
x12 = LD_DP(x); x += 2;
ST_DP(x3, y); y += 2;
x13 = LD_DP(x); x += 2;
ST_DP(x4, y); y += 2;
x14 = LD_DP(x); x += 2;
ST_DP(x5, y); y += 2;
x15 = LD_DP(x); x += 2;
ST_DP(x6, y); y += 2;
ST_DP(x7, y); y += 2;

ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2);
}

if (n & 31)
{
if (n & 16)
{
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2);
}

if (n & 8)
{
LD_DP4_INC(x, 2, x0, x1, x2, x3);
ST_DP4_INC(x0, x1, x2, x3, y, 2);
}

if (n & 4)
{
LD_GP4_INC(x, 1, f0, f1, f2, f3);
ST_GP4_INC(f0, f1, f2, f3, y, 1);
}

if (n & 2)
{
LD_GP2_INC(x, 1, f0, f1);
ST_GP2_INC(f0, f1, y, 1);
}

if (n & 1)
{
*y = *x;
}
}
}
else
{
for (i = (n >> 3); i--;)
{
LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y);
}

if (n & 4)
{
LD_GP4_INC(x, inc_x, f0, f1, f2, f3);
ST_GP4_INC(f0, f1, f2, f3, y, inc_y);
}

if (n & 2)
{
LD_GP2_INC(x, inc_x, f0, f1);
ST_GP2_INC(f0, f1, y, inc_y);
}

if (n & 1)
{
*y = *x;
}
}

return (0);
}

+ 368
- 0
kernel/mips/dscal_msa.c View File

@@ -0,0 +1,368 @@
/*******************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2)
{
BLASLONG i;
FLOAT *px;
FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
v2f64 da_vec;

px = x;

if (1 == inc_x)
{
if (0.0 == da)
{
v2f64 zero_v = __msa_cast_to_vector_double(0);
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);

for (i = (n >> 5); i--;)
{
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
zero_v, zero_v, x, 2);
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
zero_v, zero_v, x, 2);
}

if (n & 31)
{
if (n & 16)
{
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
zero_v, zero_v, x, 2);
}

if (n & 8)
{
ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2);
}

if (n & 4)
{
ST_DP2_INC(zero_v, zero_v, x, 2);
}

if (n & 2)
{
*x = 0; x += 1;
*x = 0; x += 1;
}

if (n & 1)
{
*x = 0;
}
}
}
else
{
da_vec = COPY_DOUBLE_TO_VECTOR(da);

if (n > 31)
{
FLOAT *x_pref;
BLASLONG pref_offset;

pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 32 + 16;

LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
for (i = 0; i < (n >> 5) - 1; i++)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(x_pref, 128);
PREF_OFFSET(x_pref, 160);
PREF_OFFSET(x_pref, 192);
PREF_OFFSET(x_pref, 224);
x_pref += 32;

x8 = LD_DP(px); px += 2;
x0 *= da_vec;
x9 = LD_DP(px); px += 2;
x1 *= da_vec;
x10 = LD_DP(px); px += 2;
x2 *= da_vec;
x11 = LD_DP(px); px += 2;
x3 *= da_vec;
x12 = LD_DP(px); px += 2;
x4 *= da_vec;
x13 = LD_DP(px); px += 2;
x5 *= da_vec;
x14 = LD_DP(px); px += 2;
x6 *= da_vec;
x15 = LD_DP(px); px += 2;
x7 *= da_vec;
x8 *= da_vec;
ST_DP(x0, x); x += 2;
x9 *= da_vec;
ST_DP(x1, x); x += 2;
x10 *= da_vec;
ST_DP(x2, x); x += 2;
x11 *= da_vec;
ST_DP(x3, x); x += 2;
x12 *= da_vec;
ST_DP(x4, x); x += 2;
x13 *= da_vec;
ST_DP(x5, x); x += 2;
x14 *= da_vec;
ST_DP(x6, x); x += 2;
x15 *= da_vec;
ST_DP(x7, x); x += 2;
ST_DP(x8, x); x += 2;
x0 = LD_DP(px); px += 2;
ST_DP(x9, x); x += 2;
x1 = LD_DP(px); px += 2;
ST_DP(x10, x); x += 2;
x2 = LD_DP(px); px += 2;
ST_DP(x11, x); x += 2;
x3 = LD_DP(px); px += 2;
ST_DP(x12, x); x += 2;
x4 = LD_DP(px); px += 2;
ST_DP(x13, x); x += 2;
x5 = LD_DP(px); px += 2;
ST_DP(x14, x); x += 2;
x6 = LD_DP(px); px += 2;
ST_DP(x15, x); x += 2;
x7 = LD_DP(px); px += 2;
}

x8 = LD_DP(px); px += 2;
x0 *= da_vec;
x9 = LD_DP(px); px += 2;
x1 *= da_vec;
x10 = LD_DP(px); px += 2;
x2 *= da_vec;
x11 = LD_DP(px); px += 2;
x3 *= da_vec;
x12 = LD_DP(px); px += 2;
x4 *= da_vec;
x13 = LD_DP(px); px += 2;
x5 *= da_vec;
x14 = LD_DP(px); px += 2;
x6 *= da_vec;
x15 = LD_DP(px); px += 2;
x7 *= da_vec;
x8 *= da_vec;
ST_DP(x0, x); x += 2;
x9 *= da_vec;
ST_DP(x1, x); x += 2;
x10 *= da_vec;
ST_DP(x2, x); x += 2;
x11 *= da_vec;
ST_DP(x3, x); x += 2;
x12 *= da_vec;
ST_DP(x4, x); x += 2;
x13 *= da_vec;
ST_DP(x5, x); x += 2;
x15 *= da_vec;
ST_DP(x6, x); x += 2;
x14 *= da_vec;
ST_DP(x7, x); x += 2;

ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 2);
}

if (n & 31)
{
if (n & 16)
{
LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7);
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
}

if (n & 8)
{
LD_DP4_INC(px, 2, x0, x1, x2, x3);
MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
ST_DP4_INC(x0, x1, x2, x3, x, 2);
}

if (n & 4)
{
LD_DP2_INC(px, 2, x0, x1);
MUL2(x0, da_vec, x1, da_vec, x0, x1);
ST_DP2_INC(x0, x1, x, 2);
}

if (n & 2)
{
LD_GP2_INC(px, 1, f0, f1);
MUL2(f0, da, f1, da, f0, f1);
ST_GP2_INC(f0, f1, x, 1);
}

if (n & 1)
{
*x *= da;
}
}
}
}
else
{
if (da == 0.0)
{
for (i = n; i--;)
{
*x = 0.0;

x += inc_x;
}
}
else
{
if (n > 15)
{
LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
for (i = 0; i < (n >> 4) - 1; i++)
{
LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);

f4 *= da;
f5 *= da;
*x = f0; x += inc_x;
f6 *= da;
*x = f1; x += inc_x;
f7 *= da;
*x = f2; x += inc_x;
f8 *= da;
*x = f3; x += inc_x;
f9 *= da;
*x = f4; x += inc_x;
f10 *= da;
*x = f5; x += inc_x;
f11 *= da;
*x = f6; x += inc_x;
f12 *= da;
*x = f7; x += inc_x;
f13 *= da;
*x = f8; x += inc_x;
f14 *= da;
*x = f9; x += inc_x;
f15 *= da;
*x = f10; x += inc_x;
*x = f11; x += inc_x;
f0 = *px; px += inc_x;
*x = f12; x += inc_x;
f1 = *px; px += inc_x;
*x = f13; x += inc_x;
f2 = *px; px += inc_x;
*x = f14; x += inc_x;
f3 = *px; px += inc_x;
*x = f15; x += inc_x;
f4 = *px; px += inc_x;
f5 = *px; px += inc_x;
f6 = *px; px += inc_x;
f7 = *px; px += inc_x;
}

LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);

f4 *= da;
f5 *= da;
*x = f0; x += inc_x;
f6 *= da;
*x = f1; x += inc_x;
f7 *= da;
*x = f2; x += inc_x;
f8 *= da;
*x = f3; x += inc_x;
f9 *= da;
*x = f4; x += inc_x;
f10 *= da;
*x = f5; x += inc_x;
f11 *= da;
*x = f6; x += inc_x;
f12 *= da;
*x = f7; x += inc_x;
f13 *= da;
*x = f8; x += inc_x;
f14 *= da;
*x = f9; x += inc_x;
f15 *= da;
*x = f10; x += inc_x;
*x = f11; x += inc_x;
*x = f12; x += inc_x;
*x = f13; x += inc_x;
*x = f14; x += inc_x;
*x = f15; x += inc_x;
}

if (n & 15)
{
if (n & 8)
{
LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7);
ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x);
}

if (n & 4)
{
LD_GP4_INC(px, inc_x, f0, f1, f2, f3);
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
ST_GP4_INC(f0, f1, f2, f3, x, inc_x);
}

if (n & 2)
{
LD_GP2_INC(px, inc_x, f0, f1);
MUL2(f0, da, f1, da, f0, f1);
ST_GP2_INC(f0, f1, x, inc_x);
}

if (n & 1)
{
*x *= da;
}
}
}
}

return 0;
}

+ 253
- 0
kernel/mips/dswap_msa.c View File

@@ -0,0 +1,253 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y,
FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i = 0, pref_offsetx, pref_offsety;
FLOAT *px, *py;
FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
v2f64 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
v2f64 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;

if (n < 0) return (0);

pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
if (pref_offsetx > 0)
{
pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
pref_offsetx = pref_offsetx / sizeof(FLOAT);
}

pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
if (pref_offsety > 0)
{
pref_offsety = L1_DATA_LINESIZE - pref_offsety;
pref_offsety = pref_offsety / sizeof(FLOAT);
}

px = srcx;
py = srcy;

if ((1 == inc_x) && (1 == inc_y))
{
if (n >> 4)
{
LD_DP8_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);

for (i = (n >> 4) - 1; i--;)
{
PREFETCH(px + pref_offsetx + 16);
PREFETCH(px + pref_offsetx + 20);
PREFETCH(px + pref_offsetx + 24);
PREFETCH(px + pref_offsetx + 28);

PREFETCH(py + pref_offsety + 16);
PREFETCH(py + pref_offsety + 20);
PREFETCH(py + pref_offsety + 24);
PREFETCH(py + pref_offsety + 28);

yv0 = LD_DP(py); py += 2;
ST_DP(xv0, srcy); srcy += 2;
yv1 = LD_DP(py); py += 2;
ST_DP(xv1, srcy); srcy += 2;
yv2 = LD_DP(py); py += 2;
ST_DP(xv2, srcy); srcy += 2;
yv3 = LD_DP(py); py += 2;
ST_DP(xv3, srcy); srcy += 2;
yv4 = LD_DP(py); py += 2;
ST_DP(xv4, srcy); srcy += 2;
yv5 = LD_DP(py); py += 2;
ST_DP(xv5, srcy); srcy += 2;
yv6 = LD_DP(py); py += 2;
ST_DP(xv6, srcy); srcy += 2;
yv7 = LD_DP(py); py += 2;
ST_DP(xv7, srcy); srcy += 2;

xv0 = LD_DP(px); px += 2;
ST_DP(yv0, srcx); srcx += 2;
xv1 = LD_DP(px); px += 2;
ST_DP(yv1, srcx); srcx += 2;
xv2 = LD_DP(px); px += 2;
ST_DP(yv2, srcx); srcx += 2;
xv3 = LD_DP(px); px += 2;
ST_DP(yv3, srcx); srcx += 2;
xv4 = LD_DP(px); px += 2;
ST_DP(yv4, srcx); srcx += 2;
xv5 = LD_DP(px); px += 2;
ST_DP(yv5, srcx); srcx += 2;
xv6 = LD_DP(px); px += 2;
ST_DP(yv6, srcx); srcx += 2;
xv7 = LD_DP(px); px += 2;
ST_DP(yv7, srcx); srcx += 2;
}

LD_DP8_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
ST_DP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 2);
ST_DP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 2);
}

if (n & 15)
{
if ((n & 8) && (n & 4) && (n & 2))
{
LD_DP7_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
LD_DP7_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
ST_DP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 2);
ST_DP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 2);
}
else if ((n & 8) && (n & 4))
{
LD_DP6_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5);
LD_DP6_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5);
ST_DP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 2);
ST_DP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 2);
}
else if ((n & 8) && (n & 2))
{
LD_DP5_INC(px, 2, xv0, xv1, xv2, xv3, xv4);
LD_DP5_INC(py, 2, yv0, yv1, yv2, yv3, yv4);
ST_DP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 2);
ST_DP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 2);
}
else if ((n & 4) && (n & 2))
{
LD_DP3_INC(px, 2, xv0, xv1, xv2);
LD_DP3_INC(py, 2, yv0, yv1, yv2);
ST_DP3_INC(xv0, xv1, xv2, srcy, 2);
ST_DP3_INC(yv0, yv1, yv2, srcx, 2);
}
else if (n & 8)
{
LD_DP4_INC(px, 2, xv0, xv1, xv2, xv3);
LD_DP4_INC(py, 2, yv0, yv1, yv2, yv3);
ST_DP4_INC(xv0, xv1, xv2, xv3, srcy, 2);
ST_DP4_INC(yv0, yv1, yv2, yv3, srcx, 2);
}
else if (n & 4)
{
LD_DP2_INC(px, 2, xv0, xv1);
LD_DP2_INC(py, 2, yv0, yv1);
ST_DP2_INC(xv0, xv1, srcy, 2);
ST_DP2_INC(yv0, yv1, srcx, 2);
}
else if (n & 2)
{
xv0 = LD_DP(px);
yv0 = LD_DP(py);

px += 2;
py += 2;

ST_DP(xv0, srcy);
ST_DP(yv0, srcx);

srcx += 2;
srcy += 2;
}

if (n & 1)
{
x0 = px[0];
y0 = py[0];
srcx[0] = y0;
srcy[0] = x0;
}
}
}
else
{
for (i = (n >> 3); i--;)
{
LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7);
ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y);
ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x);
}

if (n & 7)
{
if ((n & 4) && (n & 2) && (n & 1))
{
LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6);
LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6);
ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y);
ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x);
}
else if ((n & 4) && (n & 2))
{
LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5);
LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5);
ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y);
ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x);
}
else if ((n & 4) && (n & 1))
{
LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4);
LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4);
ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y);
ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x);
}
else if ((n & 2) && (n & 1))
{
LD_GP3_INC(px, inc_x, x0, x1, x2);
LD_GP3_INC(py, inc_y, y0, y1, y2);
ST_GP3_INC(x0, x1, x2, srcy, inc_y);
ST_GP3_INC(y0, y1, y2, srcx, inc_x);
}
else if (n & 4)
{
LD_GP4_INC(px, inc_x, x0, x1, x2, x3);
LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y);
ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x);
}
else if (n & 2)
{
LD_GP2_INC(px, inc_x, x0, x1);
LD_GP2_INC(py, inc_y, y0, y1);
ST_GP2_INC(x0, x1, srcy, inc_y);
ST_GP2_INC(y0, y1, srcx, inc_x);
}
else if (n & 1)
{
x0 = *srcx;
y0 = *srcy;

*srcx = y0;
*srcy = x0;
}
}
}

return (0);
}

+ 25
- 0
kernel/mips/macros_msa.h View File

@@ -722,6 +722,31 @@ inline static void prefetch_load_lf(unsigned char *src)
MUL2(in4, in5, in6, in7, out2, out3); \
}

/* Description : Multiplication of pairs of vectors and added in output
Arguments : Inputs - in0, in1, vec, out0, out1
Outputs - out0, out1
Details : Each element from 'in0' is multiplied with elements from 'vec'
and the result is added to 'out0'
*/
#define FMADD2(in0, in1, vec, inout0, inout1) \
{ \
inout0 += in0 * vec; \
inout1 += in1 * vec; \
}
#define FMADD3(in0, in1, in2, vec, \
inout0, inout1, inout2) \
{ \
inout0 += in0 * vec; \
inout1 += in1 * vec; \
inout2 += in2 * vec; \
}
#define FMADD4(in0, in1, in2, in3, vec, \
inout0, inout1, inout2, inout3) \
{ \
FMADD2(in0, in1, vec, inout0, inout1); \
FMADD2(in2, in3, vec, inout2, inout3); \
}

/* Description : Addition of 2 pairs of variables
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1


+ 265
- 0
kernel/mips/saxpy_msa.c View File

@@ -0,0 +1,265 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

#if !defined(CONJ)
#define OP0 +=
#define OP1 -=
#define OP2 +=
#else
#define OP0 -=
#define OP1 +=
#define OP2 -=
#endif

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i;
FLOAT *py;
v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
v4f32 da_vec, zero_v = {0};

if ((n < 0) || (da == 0.0)) return(0);

py = y;

if ((1 == inc_x) && (1 == inc_y))
{
FLOAT *x_pref, *y_pref;
BLASLONG pref_offset;

pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 64;

pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
y_pref = y + pref_offset + 64;

da_vec = COPY_FLOAT_TO_VECTOR(da);

for (i = (n >> 5); i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(y_pref, 0);
PREF_OFFSET(y_pref, 32);
PREF_OFFSET(y_pref, 64);
PREF_OFFSET(y_pref, 96);
x_pref += 32;
y_pref += 32;

LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7);
ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
}

if (n & 31)
{
if (n & 16)
{
LD_SP4_INC(x, 4, x0, x1, x2, x3);
LD_SP4_INC(py, 4, y0, y1, y2, y3);
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
ST_SP4_INC(y0, y1, y2, y3, y, 4);
}

if (n & 8)
{
LD_SP2_INC(x, 4, x0, x1);
LD_SP2_INC(py, 4, y0, y1);
FMADD2(x0, x1, da_vec, y0, y1);
ST_SP2_INC(y0, y1, y, 4);
}

if (n & 4)
{
x0 = LD_SP(x); x += 4;
y0 = LD_SP(py); py += 4;
y0 += da_vec * x0;
ST_SP(y0, y); y += 4;
}

if (n & 2)
{
FMADD2(x[0], x[1], da, y[0], y[1]);
x += 2;
y += 2;
}

if (n & 1)
{
y[0] += da * x[0];
}
}
}
else if (1 == inc_y)
{
da_vec = COPY_FLOAT_TO_VECTOR(da);

for (i = (n >> 4); i--;)
{
x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x));
x += inc_x;
x2 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
x2 = (v4f32) __msa_insert_w((v4i32) x2, 1, *((int *) x));
x += inc_x;
x2 = (v4f32) __msa_insert_w((v4i32) x2, 2, *((int *) x));
x += inc_x;
x2 = (v4f32) __msa_insert_w((v4i32) x2, 3, *((int *) x));
x += inc_x;
x3 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
x3 = (v4f32) __msa_insert_w((v4i32) x3, 1, *((int *) x));
x += inc_x;
x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x));
x += inc_x;
x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) x));
x += inc_x;

LD_SP4_INC(py, 4, y0, y1, y2, y3);
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
ST_SP4_INC(y0, y1, y2, y3, y, 4);
}

if (n & 15)
{
if (n & 8)
{
x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x));
x += inc_x;

LD_SP2_INC(py, 4, y0, y1);
FMADD2(x0, x1, da_vec, y0, y1);
ST_SP2_INC(y0, y1, y, 4);
}

if (n & 4)
{
x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x));
x += inc_x;

y0 = LD_SP(py); py += 4;
y0 += da_vec * x0;
ST_SP(y0, y); y += 4;
}

if (n & 2)
{
FMADD2(x[0], x[inc_x], da, y[0], y[1]);

x += 2 * inc_x;
y += 2;
}

if (n & 1)
{
y[0] += da * x[0];
}
}
}
else
{
FLOAT x0, x1, x2, x3, y0, y1, y2, y3;

for (i = (n >> 2); i--;)
{
LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3);
ST_GP4_INC(y0, y1, y2, y3, y, inc_y);
}

if (n & 3)
{
if (n & 2)
{
LD_GP2_INC(x, inc_x, x0, x1);
LD_GP2_INC(py, inc_y, y0, y1);
FMADD2(x0, x1, da, y0, y1);
ST_GP2_INC(y0, y1, y, inc_y);
}

if (n & 1)
{
*y += da * *x;
}
}
}

return (0);
}

+ 186
- 0
kernel/mips/scopy_msa.c View File

@@ -0,0 +1,186 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i;
v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
FLOAT f0, f1, f2, f3, f4, f5, f6, f7;

if (n < 0) return (0);

if ((1 == inc_x) && (1 == inc_y))
{
if (n > 63)
{
FLOAT *x_pref;
BLASLONG pref_offset;

pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 128 + 32;

LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
for (i = (n >> 6) - 1; i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(x_pref, 128);
PREF_OFFSET(x_pref, 160);
PREF_OFFSET(x_pref, 192);
PREF_OFFSET(x_pref, 224);
x_pref += 64;

x8 = LD_SP(x); x += 4;
ST_SP(x0, y); y += 4;
x9 = LD_SP(x); x += 4;
ST_SP(x1, y); y += 4;
x10 = LD_SP(x); x += 4;
ST_SP(x2, y); y += 4;
x11 = LD_SP(x); x += 4;
ST_SP(x3, y); y += 4;
x12 = LD_SP(x); x += 4;
ST_SP(x4, y); y += 4;
x13 = LD_SP(x); x += 4;
ST_SP(x5, y); y += 4;
x14 = LD_SP(x); x += 4;
ST_SP(x6, y); y += 4;
x15 = LD_SP(x); x += 4;
ST_SP(x7, y); y += 4;
x0 = LD_SP(x); x += 4;
ST_SP(x8, y); y += 4;
x1 = LD_SP(x); x += 4;
ST_SP(x9, y); y += 4;
x2 = LD_SP(x); x += 4;
ST_SP(x10, y); y += 4;
x3 = LD_SP(x); x += 4;
ST_SP(x11, y); y += 4;
x4 = LD_SP(x); x += 4;
ST_SP(x12, y); y += 4;
x5 = LD_SP(x); x += 4;
ST_SP(x13, y); y += 4;
x6 = LD_SP(x); x += 4;
ST_SP(x14, y); y += 4;
x7 = LD_SP(x); x += 4;
ST_SP(x15, y); y += 4;
}

x8 = LD_SP(x); x += 4;
x9 = LD_SP(x); x += 4;
ST_SP(x0, y); y += 4;
x10 = LD_SP(x); x += 4;
ST_SP(x1, y); y += 4;
x11 = LD_SP(x); x += 4;
ST_SP(x2, y); y += 4;
x12 = LD_SP(x); x += 4;
ST_SP(x3, y); y += 4;
x13 = LD_SP(x); x += 4;
ST_SP(x4, y); y += 4;
x14 = LD_SP(x); x += 4;
ST_SP(x5, y); y += 4;
x15 = LD_SP(x); x += 4;
ST_SP(x6, y); y += 4;
ST_SP(x7, y); y += 4;

ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4);
}

if (n & 63)
{
if (n & 32)
{
LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4);
}

if (n & 16)
{
LD_SP4_INC(x, 4, x0, x1, x2, x3);
ST_SP4_INC(x0, x1, x2, x3, y, 4);
}

if (n & 8)
{
LD_SP2_INC(x, 4, x0, x1);
ST_SP2_INC(x0, x1, y, 4);
}

if (n & 4)
{
LD_GP4_INC(x, 1, f0, f1, f2, f3);
ST_GP4_INC(f0, f1, f2, f3, y, 1);
}

if (n & 2)
{
LD_GP2_INC(x, 1, f0, f1);
ST_GP2_INC(f0, f1, y, 1);
}

if (n & 1)
{
*y = *x;
}
}
}
else
{
for (i = (n >> 3); i--;)
{
LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y);
}

if (n & 4)
{
LD_GP4_INC(x, inc_x, f0, f1, f2, f3);
ST_GP4_INC(f0, f1, f2, f3, y, inc_y);
}

if (n & 2)
{
LD_GP2_INC(x, inc_x, f0, f1);
ST_GP2_INC(f0, f1, y, inc_y);
}

if (n & 1)
{
*y = *x;
}
}

return (0);
}

+ 385
- 0
kernel/mips/sscal_msa.c View File

@@ -0,0 +1,385 @@
/*******************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2)
{
BLASLONG i;
FLOAT *px;
FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
v4f32 da_vec;

px = x;

if (1 == inc_x)
{
if (0.0 == da)
{
v4f32 zero_v = __msa_cast_to_vector_float(0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0);

for (i = (n >> 6); i--;)
{
ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
zero_v, zero_v, x, 4);
ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
zero_v, zero_v, x, 4);
}

if (n & 63)
{
if (n & 32)
{
ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
zero_v, zero_v, x, 4);
}

if (n & 16)
{
ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4);
}

if (n & 8)
{
ST_SP2_INC(zero_v, zero_v, x, 4);
}

if (n & 4)
{
*x = 0; x += 1;
*x = 0; x += 1;
*x = 0; x += 1;
*x = 0; x += 1;
}

if (n & 2)
{
*x = 0; x += 1;
*x = 0; x += 1;
}

if (n & 1)
{
*x = 0;
}
}
}
else
{
da_vec = COPY_FLOAT_TO_VECTOR(da);

if (n > 63)
{
FLOAT *x_pref;
BLASLONG pref_offset;

pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 64 + 32;

LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
for (i = 0; i < (n >> 6) - 1; i++)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(x_pref, 128);
PREF_OFFSET(x_pref, 160);
PREF_OFFSET(x_pref, 192);
PREF_OFFSET(x_pref, 224);
x_pref += 64;

x8 = LD_SP(px); px += 4;
x0 *= da_vec;
x9 = LD_SP(px); px += 4;
x1 *= da_vec;
x10 = LD_SP(px); px += 4;
x2 *= da_vec;
x11 = LD_SP(px); px += 4;
x3 *= da_vec;
x12 = LD_SP(px); px += 4;
x4 *= da_vec;
x13 = LD_SP(px); px += 4;
x5 *= da_vec;
x14 = LD_SP(px); px += 4;
x6 *= da_vec;
x15 = LD_SP(px); px += 4;
x7 *= da_vec;
x8 *= da_vec;
ST_SP(x0, x); x += 4;
x9 *= da_vec;
ST_SP(x1, x); x += 4;
x10 *= da_vec;
ST_SP(x2, x); x += 4;
x11 *= da_vec;
ST_SP(x3, x); x += 4;
x12 *= da_vec;
ST_SP(x4, x); x += 4;
x13 *= da_vec;
ST_SP(x5, x); x += 4;
x14 *= da_vec;
ST_SP(x6, x); x += 4;
x15 *= da_vec;
ST_SP(x7, x); x += 4;
ST_SP(x8, x); x += 4;
x0 = LD_SP(px); px += 4;
ST_SP(x9, x); x += 4;
x1 = LD_SP(px); px += 4;
ST_SP(x10, x); x += 4;
x2 = LD_SP(px); px += 4;
ST_SP(x11, x); x += 4;
x3 = LD_SP(px); px += 4;
ST_SP(x12, x); x += 4;
x4 = LD_SP(px); px += 4;
ST_SP(x13, x); x += 4;
x5 = LD_SP(px); px += 4;
ST_SP(x14, x); x += 4;
x6 = LD_SP(px); px += 4;
ST_SP(x15, x); x += 4;
x7 = LD_SP(px); px += 4;
}

x8 = LD_SP(px); px += 4;
x0 *= da_vec;
x9 = LD_SP(px); px += 4;
x1 *= da_vec;
x10 = LD_SP(px); px += 4;
x2 *= da_vec;
x11 = LD_SP(px); px += 4;
x3 *= da_vec;
x12 = LD_SP(px); px += 4;
x4 *= da_vec;
x13 = LD_SP(px); px += 4;
x5 *= da_vec;
x14 = LD_SP(px); px += 4;
x6 *= da_vec;
x15 = LD_SP(px); px += 4;
x7 *= da_vec;
x8 *= da_vec;
ST_SP(x0, x); x += 4;
x9 *= da_vec;
ST_SP(x1, x); x += 4;
x10 *= da_vec;
ST_SP(x2, x); x += 4;
x11 *= da_vec;
ST_SP(x3, x); x += 4;
x12 *= da_vec;
ST_SP(x4, x); x += 4;
x13 *= da_vec;
ST_SP(x5, x); x += 4;
x15 *= da_vec;
ST_SP(x6, x); x += 4;
x14 *= da_vec;
ST_SP(x7, x); x += 4;

ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 4);
}

if (n & 63)
{
if (n & 32)
{
LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7);
ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
}

if (n & 16)
{
LD_SP4_INC(px, 4, x0, x1, x2, x3);
MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
ST_SP4_INC(x0, x1, x2, x3, x, 4);
}

if (n & 8)
{
LD_SP2_INC(px, 4, x0, x1);
MUL2(x0, da_vec, x1, da_vec, x0, x1);
ST_SP2_INC(x0, x1, x, 4);
}

if (n & 4)
{
LD_GP4_INC(px, 1, f0, f1, f2, f3);
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
ST_GP4_INC(f0, f1, f2, f3, x, 1);
}

if (n & 2)
{
LD_GP2_INC(px, 1, f0, f1);
MUL2(f0, da, f1, da, f0, f1);
ST_GP2_INC(f0, f1, x, 1);
}

if (n & 1)
{
*x *= da;
}
}
}
}
else
{
if (0.0 == da)
{
for (i = n; i--;)
{
*x = 0;
x += inc_x;
}
}
else
{
if (n > 15)
{
LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);

for (i = 0; i < (n >> 4) - 1; i++)
{
LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);

f4 *= da;
f5 *= da;
*x = f0; x += inc_x;
f6 *= da;
*x = f1; x += inc_x;
f7 *= da;
*x = f2; x += inc_x;
f8 *= da;
*x = f3; x += inc_x;
f9 *= da;
*x = f4; x += inc_x;
f10 *= da;
*x = f5; x += inc_x;
f11 *= da;
*x = f6; x += inc_x;
f12 *= da;
*x = f7; x += inc_x;
f13 *= da;
*x = f8; x += inc_x;
f14 *= da;
*x = f9; x += inc_x;
f15 *= da;
*x = f10; x += inc_x;
*x = f11; x += inc_x;
f0 = *px; px += inc_x;
*x = f12; x += inc_x;
f1 = *px; px += inc_x;
*x = f13; x += inc_x;
f2 = *px; px += inc_x;
*x = f14; x += inc_x;
f3 = *px; px += inc_x;
*x = f15; x += inc_x;
f4 = *px; px += inc_x;
f5 = *px; px += inc_x;
f6 = *px; px += inc_x;
f7 = *px; px += inc_x;
}

LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);

f4 *= da;
f5 *= da;
*x = f0; x += inc_x;
f6 *= da;
*x = f1; x += inc_x;
f7 *= da;
*x = f2; x += inc_x;
f8 *= da;
*x = f3; x += inc_x;
f9 *= da;
*x = f4; x += inc_x;
f10 *= da;
*x = f5; x += inc_x;
f11 *= da;
*x = f6; x += inc_x;
f12 *= da;
*x = f7; x += inc_x;
f13 *= da;
*x = f8; x += inc_x;
f14 *= da;
*x = f9; x += inc_x;
f15 *= da;
*x = f10; x += inc_x;
*x = f11; x += inc_x;
*x = f12; x += inc_x;
*x = f13; x += inc_x;
*x = f14; x += inc_x;
*x = f15; x += inc_x;
}

if (n & 15)
{
if (n & 8)
{
LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7);
ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x);
}

if (n & 4)
{
LD_GP4_INC(px, inc_x, f0, f1, f2, f3);
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
ST_GP4_INC(f0, f1, f2, f3, x, inc_x);
}

if (n & 2)
{
LD_GP2_INC(px, inc_x, f0, f1);
MUL2(f0, da, f1, da, f0, f1);
ST_GP2_INC(f0, f1, x, inc_x);
}

if (n & 1)
{
*x *= da;
}
}
}
}

return 0;
}

+ 267
- 0
kernel/mips/sswap_msa.c View File

@@ -0,0 +1,267 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y,
FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i = 0, pref_offsetx, pref_offsety;
FLOAT *px, *py;
FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;

if (n < 0) return (0);

pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
if (pref_offsetx > 0)
{
pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
pref_offsetx = pref_offsetx / sizeof(FLOAT);
}

pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
if (pref_offsety > 0)
{
pref_offsety = L1_DATA_LINESIZE - pref_offsety;
pref_offsety = pref_offsety / sizeof(FLOAT);
}

px = srcx;
py = srcy;

if ((1 == inc_x) && (1 == inc_y))
{
if (n >> 5)
{
LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);

for (i = (n >> 5) - 1; i--;)
{
PREFETCH(px + pref_offsetx + 32);
PREFETCH(px + pref_offsetx + 40);
PREFETCH(px + pref_offsetx + 48);
PREFETCH(px + pref_offsetx + 56);

PREFETCH(py + pref_offsety + 32);
PREFETCH(py + pref_offsety + 40);
PREFETCH(py + pref_offsety + 48);
PREFETCH(py + pref_offsety + 56);

yv0 = LD_SP(py); py += 4;
ST_SP(xv0, srcy); srcy += 4;
yv1 = LD_SP(py); py += 4;
ST_SP(xv1, srcy); srcy += 4;
yv2 = LD_SP(py); py += 4;
ST_SP(xv2, srcy); srcy += 4;
yv3 = LD_SP(py); py += 4;
ST_SP(xv3, srcy); srcy += 4;
yv4 = LD_SP(py); py += 4;
ST_SP(xv4, srcy); srcy += 4;
yv5 = LD_SP(py); py += 4;
ST_SP(xv5, srcy); srcy += 4;
yv6 = LD_SP(py); py += 4;
ST_SP(xv6, srcy); srcy += 4;
yv7 = LD_SP(py); py += 4;
ST_SP(xv7, srcy); srcy += 4;

xv0 = LD_SP(px); px += 4;
ST_SP(yv0, srcx); srcx += 4;
xv1 = LD_SP(px); px += 4;
ST_SP(yv1, srcx); srcx += 4;
xv2 = LD_SP(px); px += 4;
ST_SP(yv2, srcx); srcx += 4;
xv3 = LD_SP(px); px += 4;
ST_SP(yv3, srcx); srcx += 4;
xv4 = LD_SP(px); px += 4;
ST_SP(yv4, srcx); srcx += 4;
xv5 = LD_SP(px); px += 4;
ST_SP(yv5, srcx); srcx += 4;
xv6 = LD_SP(px); px += 4;
ST_SP(yv6, srcx); srcx += 4;
xv7 = LD_SP(px); px += 4;
ST_SP(yv7, srcx); srcx += 4;
}

LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4);
ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4);
}

if (n & 31)
{
if ((n & 16) && (n & 8) && (n & 4))
{
LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4);
ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4);
}
else if ((n & 16) && (n & 8))
{
LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5);
LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5);
ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4);
ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4);
}
else if ((n & 16) && (n & 4))
{
LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4);
LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4);
ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4);
ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4);
}
else if ((n & 8) && (n & 4))
{
LD_SP3_INC(px, 4, xv0, xv1, xv2);
LD_SP3_INC(py, 4, yv0, yv1, yv2);
ST_SP3_INC(xv0, xv1, xv2, srcy, 4);
ST_SP3_INC(yv0, yv1, yv2, srcx, 4);
}
else if (n & 16)
{
LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3);
LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3);
ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4);
ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4);
}
else if (n & 8)
{
LD_SP2_INC(px, 4, xv0, xv1);
LD_SP2_INC(py, 4, yv0, yv1);
ST_SP2_INC(xv0, xv1, srcy, 4);
ST_SP2_INC(yv0, yv1, srcx, 4);
}
else if (n & 4)
{
xv0 = LD_SP(px);
yv0 = LD_SP(py);

px += 4;
py += 4;

ST_SP(xv0, srcy);
ST_SP(yv0, srcx);

srcx += 4;
srcy += 4;
}

if ((n & 2) && (n & 1))
{
LD_GP3_INC(px, 1, x0, x1, x3);
LD_GP3_INC(py, 1, y0, y1, y3);
ST_GP3_INC(x0, x1, x3, srcy, 1);
ST_GP3_INC(y0, y1, y3, srcx, 1);
}
else if (n & 2)
{
LD_GP2_INC(px, 1, x0, x1);
LD_GP2_INC(py, 1, y0, y1);
ST_GP2_INC(x0, x1, srcy, 1);
ST_GP2_INC(y0, y1, srcx, 1);
}
else if (n & 1)
{
x0 = px[0];
y0 = py[0];
srcx[0] = y0;
srcy[0] = x0;
}
}
}
else
{
for (i = (n >> 3); i--;)
{
LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7);
ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y);
ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x);
}

if (n & 7)
{
if ((n & 4) && (n & 2) && (n & 1))
{
LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6);
LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6);
ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y);
ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x);
}
else if ((n & 4) && (n & 2))
{
LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5);
LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5);
ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y);
ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x);
}
else if ((n & 4) && (n & 1))
{
LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4);
LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4);
ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y);
ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x);
}
else if ((n & 2) && (n & 1))
{
LD_GP3_INC(px, inc_x, x0, x1, x2);
LD_GP3_INC(py, inc_y, y0, y1, y2);
ST_GP3_INC(x0, x1, x2, srcy, inc_y);
ST_GP3_INC(y0, y1, y2, srcx, inc_x);
}
else if (n & 4)
{
LD_GP4_INC(px, inc_x, x0, x1, x2, x3);
LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y);
ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x);
}
else if (n & 2)
{
LD_GP2_INC(px, inc_x, x0, x1);
LD_GP2_INC(py, inc_y, y0, y1);
ST_GP2_INC(x0, x1, srcy, inc_y);
ST_GP2_INC(y0, y1, srcx, inc_x);
}
else if (n & 1)
{
x0 = *srcx;
y0 = *srcy;

*srcx = y0;
*srcy = x0;
}
}
}

return (0);
}

+ 494
- 0
kernel/mips/zaxpy_msa.c View File

@@ -0,0 +1,494 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

#if !defined(CONJ)
#define OP0 +=
#define OP1 -=
#define OP2 +=
#else
#define OP0 -=
#define OP1 +=
#define OP2 -=
#endif

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2)
{
BLASLONG i, inc_x2, inc_y2;
FLOAT *py;
v2f64 x0, x1, x2, x3, x4, x5, x6, x7;
v2f64 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec;
v2f64 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i;
v2f64 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i;
FLOAT xd0, xd1, yd0, yd1;

if (n < 0) return(0);
if ((da_r == 0.0) && (da_i == 0.0)) return(0);

py = y;

dar_vec = COPY_DOUBLE_TO_VECTOR(da_r);
dai_vec = COPY_DOUBLE_TO_VECTOR(da_i);

if ((1 == inc_x) && (1 == inc_y))
{
FLOAT *x_pref, *y_pref;
BLASLONG pref_offset;

pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 32;

pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
y_pref = y + pref_offset + 32;

for (i = (n >> 3); i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(y_pref, 0);
PREF_OFFSET(y_pref, 32);
PREF_OFFSET(y_pref, 64);
PREF_OFFSET(y_pref, 96);
x_pref += 16;
y_pref += 16;

LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
PCKEVOD_D2_DP(x5, x4, x2r, x2i);
PCKEVOD_D2_DP(y5, y4, y2r, y2i);
PCKEVOD_D2_DP(x7, x6, x3r, x3i);
PCKEVOD_D2_DP(y7, y6, y3r, y3i);

FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y2i OP0 dar_vec * x2i;
y3i OP0 dar_vec * x3i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y2r OP1 dai_vec * x2i;
y3r OP1 dai_vec * x3i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
y2i OP2 dai_vec * x2r;
y3i OP2 dai_vec * x3r;

ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ILVRL_D2_DP(y2i, y2r, y4, y5);
ILVRL_D2_DP(y3i, y3r, y6, y7);
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
}

if (n & 7)
{
if (n & 4)
{
LD_DP4_INC(x, 2, x0, x1, x2, x3);
LD_DP4_INC(py, 2, y0, y1, y2, y3);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);

FMADD2(x0r, x1r, dar_vec, y0r, y1r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;

ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ST_DP4_INC(y0, y1, y2, y3, y, 2);
}

if (n & 2)
{
LD_DP2_INC(x, 2, x0, x1);
LD_DP2_INC(py, 2, y0, y1);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);

y0r += dar_vec * x0r;
y0i OP0 dar_vec * x0i;
y0r OP1 dai_vec * x0i;
y0i OP2 dai_vec * x0r;

ILVRL_D2_DP(y0i, y0r, y0, y1);
ST_DP2_INC(y0, y1, y, 2);
}

if (n & 1)
{
LD_GP2_INC(x, 1, xd0, xd1);
LD_GP2_INC(py, 1, yd0, yd1);

yd0 += da_r * xd0;
yd1 OP0 da_r * xd1;
yd0 OP1 da_i * xd1;
yd1 OP2 da_i * xd0;

ST_GP2_INC(yd0, yd1, y, 1);
}
}
}
else if (1 == inc_y)
{
FLOAT *y_pref;
BLASLONG pref_offset;

pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
y_pref = y + pref_offset + 32;

inc_x2 = 2 * inc_x;

for (i = (n >> 3); i--;)
{
PREF_OFFSET(y_pref, 0);
PREF_OFFSET(y_pref, 32);
PREF_OFFSET(y_pref, 64);
PREF_OFFSET(y_pref, 96);
y_pref += 16;

LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
PCKEVOD_D2_DP(x5, x4, x2r, x2i);
PCKEVOD_D2_DP(y5, y4, y2r, y2i);
PCKEVOD_D2_DP(x7, x6, x3r, x3i);
PCKEVOD_D2_DP(y7, y6, y3r, y3i);

FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y2i OP0 dar_vec * x2i;
y3i OP0 dar_vec * x3i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y2r OP1 dai_vec * x2i;
y3r OP1 dai_vec * x3i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
y2i OP2 dai_vec * x2r;
y3i OP2 dai_vec * x3r;

ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ILVRL_D2_DP(y2i, y2r, y4, y5);
ILVRL_D2_DP(y3i, y3r, y6, y7);
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
}

if (n & 7)
{
if (n & 4)
{
LD_DP4_INC(x, inc_x2, x0, x1, x2, x3);
LD_DP4_INC(py, 2, y0, y1, y2, y3);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);

FMADD2(x0r, x1r, dar_vec, y0r, y1r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;

ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ST_DP4_INC(y0, y1, y2, y3, y, 2);
}

if (n & 2)
{
LD_DP2_INC(x, inc_x2, x0, x1);
LD_DP2_INC(py, 2, y0, y1);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);

y0r += dar_vec * x0r;
y0i OP0 dar_vec * x0i;
y0r OP1 dai_vec * x0i;
y0i OP2 dai_vec * x0r;

ILVRL_D2_DP(y0i, y0r, y0, y1);
ST_DP2_INC(y0, y1, y, 2);
}

if (n & 1)
{
LD_GP2_INC(x, 1, xd0, xd1);
LD_GP2_INC(py, 1, yd0, yd1);

yd0 += da_r * xd0;
yd1 OP0 da_r * xd1;
yd0 OP1 da_i * xd1;
yd1 OP2 da_i * xd0;

ST_GP2_INC(yd0, yd1, y, 1);
}
}
}
else if (1 == inc_x)
{
FLOAT *x_pref;
BLASLONG pref_offset;

pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 32;

inc_y2 = 2 * inc_y;

for (i = (n >> 3); i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
x_pref += 16;

LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
PCKEVOD_D2_DP(x5, x4, x2r, x2i);
PCKEVOD_D2_DP(y5, y4, y2r, y2i);
PCKEVOD_D2_DP(x7, x6, x3r, x3i);
PCKEVOD_D2_DP(y7, y6, y3r, y3i);

FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y2i OP0 dar_vec * x2i;
y3i OP0 dar_vec * x3i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y2r OP1 dai_vec * x2i;
y3r OP1 dai_vec * x3i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
y2i OP2 dai_vec * x2r;
y3i OP2 dai_vec * x3r;

ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ILVRL_D2_DP(y2i, y2r, y4, y5);
ILVRL_D2_DP(y3i, y3r, y6, y7);
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2);
}

if (n & 7)
{
if (n & 4)
{
LD_DP4_INC(x, 2, x0, x1, x2, x3);
LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);

FMADD2(x0r, x1r, dar_vec, y0r, y1r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;

ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ST_DP4_INC(y0, y1, y2, y3, y, inc_y2);
}

if (n & 2)
{
LD_DP2_INC(x, 2, x0, x1);
LD_DP2_INC(py, inc_y2, y0, y1);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);

y0r += dar_vec * x0r;
y0i OP0 dar_vec * x0i;
y0r OP1 dai_vec * x0i;
y0i OP2 dai_vec * x0r;

ILVRL_D2_DP(y0i, y0r, y0, y1);
ST_DP2_INC(y0, y1, y, inc_y2);
}

if (n & 1)
{
LD_GP2_INC(x, 1, xd0, xd1);
LD_GP2_INC(py, 1, yd0, yd1);

yd0 += da_r * xd0;
yd1 OP0 da_r * xd1;
yd0 OP1 da_i * xd1;
yd1 OP2 da_i * xd0;

ST_GP2_INC(yd0, yd1, y, 1);
}
}
}
else
{
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;

for (i = (n >> 3); i--;)
{
LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
PCKEVOD_D2_DP(x5, x4, x2r, x2i);
PCKEVOD_D2_DP(y5, y4, y2r, y2i);
PCKEVOD_D2_DP(x7, x6, x3r, x3i);
PCKEVOD_D2_DP(y7, y6, y3r, y3i);

FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y2i OP0 dar_vec * x2i;
y3i OP0 dar_vec * x3i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y2r OP1 dai_vec * x2i;
y3r OP1 dai_vec * x3i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
y2i OP2 dai_vec * x2r;
y3i OP2 dai_vec * x3r;

ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ILVRL_D2_DP(y2i, y2r, y4, y5);
ILVRL_D2_DP(y3i, y3r, y6, y7);
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2);
}

if (n & 7)
{
if (n & 4)
{
LD_DP4_INC(x, inc_x2, x0, x1, x2, x3);
LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);

FMADD2(x0r, x1r, dar_vec, y0r, y1r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;

ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ST_DP4_INC(y0, y1, y2, y3, y, inc_y2);
}

if (n & 2)
{
LD_DP2_INC(x, inc_x2, x0, x1);
LD_DP2_INC(py, inc_y2, y0, y1);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);

y0r += dar_vec * x0r;
y0i OP0 dar_vec * x0i;
y0r OP1 dai_vec * x0i;
y0i OP2 dai_vec * x0r;

ILVRL_D2_DP(y0i, y0r, y0, y1);
ST_DP2_INC(y0, y1, y, inc_y2);
}

if (n & 1)
{
LD_GP2_INC(x, 1, xd0, xd1);
LD_GP2_INC(py, 1, yd0, yd1);

yd0 += da_r * xd0;
yd1 OP0 da_r * xd1;
yd0 OP1 da_i * xd1;
yd1 OP2 da_i * xd0;

ST_GP2_INC(yd0, yd1, y, 1);
}
}
}

return (0);
}

+ 218
- 0
kernel/mips/zcopy_msa.c View File

@@ -0,0 +1,218 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i;
v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
FLOAT f0, f1;

if (n < 0) return (0);

if ((1 == inc_x) && (1 == inc_y))
{
if (n > 15)
{
FLOAT *x_pref;
BLASLONG pref_offset;

pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 64 + 16;

LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
for (i = (n >> 4) - 1; i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(x_pref, 128);
PREF_OFFSET(x_pref, 160);
PREF_OFFSET(x_pref, 192);
PREF_OFFSET(x_pref, 224);
x_pref += 32;

x8 = LD_DP(x); x += 2;
ST_DP(x0, y); y += 2;
x9 = LD_DP(x); x += 2;
ST_DP(x1, y); y += 2;
x10 = LD_DP(x); x += 2;
ST_DP(x2, y); y += 2;
x11 = LD_DP(x); x += 2;
ST_DP(x3, y); y += 2;
x12 = LD_DP(x); x += 2;
ST_DP(x4, y); y += 2;
x13 = LD_DP(x); x += 2;
ST_DP(x5, y); y += 2;
x14 = LD_DP(x); x += 2;
ST_DP(x6, y); y += 2;
x15 = LD_DP(x); x += 2;
ST_DP(x7, y); y += 2;
x0 = LD_DP(x); x += 2;
ST_DP(x8, y); y += 2;
x1 = LD_DP(x); x += 2;
ST_DP(x9, y); y += 2;
x2 = LD_DP(x); x += 2;
ST_DP(x10, y); y += 2;
x3 = LD_DP(x); x += 2;
ST_DP(x11, y); y += 2;
x4 = LD_DP(x); x += 2;
ST_DP(x12, y); y += 2;
x5 = LD_DP(x); x += 2;
ST_DP(x13, y); y += 2;
x6 = LD_DP(x); x += 2;
ST_DP(x14, y); y += 2;
x7 = LD_DP(x); x += 2;
ST_DP(x15, y); y += 2;
}

x8 = LD_DP(x); x += 2;
x9 = LD_DP(x); x += 2;
ST_DP(x0, y); y += 2;
x10 = LD_DP(x); x += 2;
ST_DP(x1, y); y += 2;
x11 = LD_DP(x); x += 2;
ST_DP(x2, y); y += 2;
x12 = LD_DP(x); x += 2;
ST_DP(x3, y); y += 2;
x13 = LD_DP(x); x += 2;
ST_DP(x4, y); y += 2;
x14 = LD_DP(x); x += 2;
ST_DP(x5, y); y += 2;
x15 = LD_DP(x); x += 2;
ST_DP(x6, y); y += 2;
ST_DP(x7, y); y += 2;

ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2);
}

if (n & 15)
{
if (n & 8)
{
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2);
}

if (n & 4)
{
LD_DP4_INC(x, 2, x0, x1, x2, x3);
ST_DP4_INC(x0, x1, x2, x3, y, 2);
}

if (n & 2)
{
LD_DP2_INC(x, 2, x0, x1);
ST_DP2_INC(x0, x1, y, 2);
}

if (n & 1)
{
LD_GP2_INC(x, 1, f0, f1);
ST_GP2_INC(f0, f1, y, 1);
}
}
}
else
{
inc_x *= 2;
inc_y *= 2;

for (i = (n >> 4); i--;)
{
x0 = LD_DP(x); x += inc_x;
x1 = LD_DP(x); x += inc_x;
x2 = LD_DP(x); x += inc_x;
x3 = LD_DP(x); x += inc_x;
x4 = LD_DP(x); x += inc_x;
x5 = LD_DP(x); x += inc_x;
x6 = LD_DP(x); x += inc_x;
x7 = LD_DP(x); x += inc_x;
x8 = LD_DP(x); x += inc_x;
ST_DP(x0, y); y += inc_y;
x9 = LD_DP(x); x += inc_x;
ST_DP(x1, y); y += inc_y;
x10 = LD_DP(x); x += inc_x;
ST_DP(x2, y); y += inc_y;
x11 = LD_DP(x); x += inc_x;
ST_DP(x3, y); y += inc_y;
x12 = LD_DP(x); x += inc_x;
ST_DP(x4, y); y += inc_y;
x13 = LD_DP(x); x += inc_x;
ST_DP(x5, y); y += inc_y;
x14 = LD_DP(x); x += inc_x;
ST_DP(x6, y); y += inc_y;
x15 = LD_DP(x); x += inc_x;
ST_DP(x7, y); y += inc_y;
ST_DP(x8, y); y += inc_y;
ST_DP(x9, y); y += inc_y;
ST_DP(x10, y); y += inc_y;
ST_DP(x11, y); y += inc_y;
ST_DP(x12, y); y += inc_y;
ST_DP(x13, y); y += inc_y;
ST_DP(x14, y); y += inc_y;
ST_DP(x15, y); y += inc_y;
}

if (n & 15)
{
if (n & 8)
{
LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, inc_y);
}

if (n & 4)
{
LD_DP4_INC(x, inc_x, x0, x1, x2, x3);
ST_DP4_INC(x0, x1, x2, x3, y, inc_y);
}

if (n & 2)
{
LD_DP2_INC(x, inc_x, x0, x1);
ST_DP2_INC(x0, x1, y, inc_y);
}

if (n & 1)
{
LD_GP2_INC(x, 1, f0, f1);
ST_GP2_INC(f0, f1, y, 1);
}
}
}

return (0);
}

+ 717
- 0
kernel/mips/zscal_msa.c View File

@@ -0,0 +1,717 @@
/*******************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

/* This will shuffle the elements in 'in' vector as (mask needed :: 01 00 11 10)
0 1 2 3 => 2 3 0 1 */
#define SHF_78 78

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2)
{
BLASLONG i, inc_x2;
FLOAT *px;
FLOAT tp0, tp1, f0, f1;
v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
v2f64 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15;
v2f64 da_i_vec, da_i_vec_neg, da_r_vec;

px = x;

if (1 == inc_x)
{
if ((0.0 == da_r) && (0.0 == da_i))
{
v2f64 zero_v = __msa_cast_to_vector_double(0);
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);

for (i = (n >> 4); i--;)
{
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
zero_v, zero_v, x, 2);
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
zero_v, zero_v, x, 2);
}

if (n & 15)
{
if (n & 8)
{
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
zero_v, zero_v, x, 2);
}

if (n & 4)
{
ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2);
}

if (n & 2)
{
ST_DP2_INC(zero_v, zero_v, x, 2);
}

if (n & 1)
{
ST_DP(zero_v, x);
}
}
}
else if (0.0 == da_r)
{
da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
da_i_vec_neg = -da_i_vec;
da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);

if (n > 15)
{
FLOAT *x_pref;
BLASLONG pref_offset;

pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 32 + 16;

LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
for (i = (n >> 4)- 1; i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(x_pref, 128);
PREF_OFFSET(x_pref, 160);
PREF_OFFSET(x_pref, 192);
PREF_OFFSET(x_pref, 224);
x_pref += 32;

x8 = LD_DP(px); px += 2;
x0 *= da_i_vec;
x9 = LD_DP(px); px += 2;
x1 *= da_i_vec;
x10 = LD_DP(px); px += 2;
x2 *= da_i_vec;
x11 = LD_DP(px); px += 2;
x3 *= da_i_vec;
x12 = LD_DP(px); px += 2;
x4 *= da_i_vec;
x13 = LD_DP(px); px += 2;
x5 *= da_i_vec;
x0 = (v2f64) __msa_shf_w((v4i32) x0, SHF_78);
x14 = LD_DP(px); px += 2;
x6 *= da_i_vec;
x1 = (v2f64) __msa_shf_w((v4i32) x1, SHF_78);
x15 = LD_DP(px); px += 2;
x7 *= da_i_vec;
x2 = (v2f64) __msa_shf_w((v4i32) x2, SHF_78);
x8 *= da_i_vec;
x3 = (v2f64) __msa_shf_w((v4i32) x3, SHF_78);
ST_DP(x0, x); x += 2;
x9 *= da_i_vec;
x4 = (v2f64) __msa_shf_w((v4i32) x4, SHF_78);
ST_DP(x1, x); x += 2;
x10 *= da_i_vec;
x5 = (v2f64) __msa_shf_w((v4i32) x5, SHF_78);
ST_DP(x2, x); x += 2;
x11 *= da_i_vec;
x6 = (v2f64) __msa_shf_w((v4i32) x6, SHF_78);
ST_DP(x3, x); x += 2;
x12 *= da_i_vec;
x7 = (v2f64) __msa_shf_w((v4i32) x7, SHF_78);
ST_DP(x4, x); x += 2;
x13 *= da_i_vec;
x8 = (v2f64) __msa_shf_w((v4i32) x8, SHF_78);
ST_DP(x5, x); x += 2;
x14 *= da_i_vec;
x9 = (v2f64) __msa_shf_w((v4i32) x9, SHF_78);
ST_DP(x6, x); x += 2;
x15 *= da_i_vec;
x10 = (v2f64) __msa_shf_w((v4i32) x10, SHF_78);
ST_DP(x7, x); x += 2;
x11 = (v2f64) __msa_shf_w((v4i32) x11, SHF_78);
ST_DP(x8, x); x += 2;
x0 = LD_DP(px); px += 2;
x12 = (v2f64) __msa_shf_w((v4i32) x12, SHF_78);
ST_DP(x9, x); x += 2;
x1 = LD_DP(px); px += 2;
x13 = (v2f64) __msa_shf_w((v4i32) x13, SHF_78);
ST_DP(x10, x); x += 2;
x2 = LD_DP(px); px += 2;
x14 = (v2f64) __msa_shf_w((v4i32) x14, SHF_78);
ST_DP(x11, x); x += 2;
x3 = LD_DP(px); px += 2;
x15 = (v2f64) __msa_shf_w((v4i32) x15, SHF_78);
ST_DP(x12, x); x += 2;
x4 = LD_DP(px); px += 2;
ST_DP(x13, x); x += 2;
x5 = LD_DP(px); px += 2;
ST_DP(x14, x); x += 2;
x6 = LD_DP(px); px += 2;
ST_DP(x15, x); x += 2;
x7 = LD_DP(px); px += 2;
}

LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
x0, x1, x2, x3);
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
x4, x5, x6, x7);
MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
x8, x9, x10, x11);
MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
x12, x13, x14, x15);
SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
x12, x13, x14, x15, x, 2);
}

if (n & 15)
{
if (n & 8)
{
LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
x0, x1, x2, x3);
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
x4, x5, x6, x7);
SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
}

if (n & 4)
{
LD_DP4_INC(px, 2, x0, x1, x2, x3);
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
x0, x1, x2, x3);
SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
ST_DP4_INC(x0, x1, x2, x3, x, 2);
}

if (n & 2)
{
LD_DP2_INC(px, 2, x0, x1);
MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
SHF_W2_DP(x0, x1, x0, x1, SHF_78);
ST_DP2_INC(x0, x1, x, 2);
}

if (n & 1)
{
LD_GP2_INC(px, 1, f0, f1);
MUL2(f0, da_i, f1, -da_i, f0, f1);
ST_GP2_INC(f1, f0, x, 1);
}
}
}
else if (0.0 == da_i)
{
da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);

if (n > 15)
{
FLOAT *x_pref;
BLASLONG pref_offset;

pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 32 + 16;

LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
for (i = (n >> 4)- 1; i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(x_pref, 128);
PREF_OFFSET(x_pref, 160);
PREF_OFFSET(x_pref, 192);
PREF_OFFSET(x_pref, 224);
x_pref += 32;

x8 = LD_DP(px); px += 2;
x0 *= da_r_vec;
x9 = LD_DP(px); px += 2;
x1 *= da_r_vec;
x10 = LD_DP(px); px += 2;
x2 *= da_r_vec;
x11 = LD_DP(px); px += 2;
x3 *= da_r_vec;
x12 = LD_DP(px); px += 2;
x4 *= da_r_vec;
x13 = LD_DP(px); px += 2;
x5 *= da_r_vec;
ST_DP(x0, x); x += 2;
x14 = LD_DP(px); px += 2;
x6 *= da_r_vec;
ST_DP(x1, x); x += 2;
x15 = LD_DP(px); px += 2;
x7 *= da_r_vec;
ST_DP(x2, x); x += 2;
x8 *= da_r_vec;
ST_DP(x3, x); x += 2;
x9 *= da_r_vec;
ST_DP(x4, x); x += 2;
x10 *= da_r_vec;
ST_DP(x5, x); x += 2;
x11 *= da_r_vec;
ST_DP(x6, x); x += 2;
x12 *= da_r_vec;
ST_DP(x7, x); x += 2;
x13 *= da_r_vec;
ST_DP(x8, x); x += 2;
x0 = LD_DP(px); px += 2;
x14 *= da_r_vec;
ST_DP(x9, x); x += 2;
x1 = LD_DP(px); px += 2;
x15 *= da_r_vec;
ST_DP(x10, x); x += 2;
x2 = LD_DP(px); px += 2;
ST_DP(x11, x); x += 2;
x3 = LD_DP(px); px += 2;
ST_DP(x12, x); x += 2;
x4 = LD_DP(px); px += 2;
ST_DP(x13, x); x += 2;
x5 = LD_DP(px); px += 2;
ST_DP(x14, x); x += 2;
x6 = LD_DP(px); px += 2;
ST_DP(x15, x); x += 2;
x7 = LD_DP(px); px += 2;
}

LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
x0, x1, x2, x3);
MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
x4, x5, x6, x7);
MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
x8, x9, x10, x11);
MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
x12, x13, x14, x15);
ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
x12, x13, x14, x15, x, 2);
}

if (n & 15)
{
if (n & 8)
{
LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
x0, x1, x2, x3);
MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
x4, x5, x6, x7);
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
}

if (n & 4)
{
LD_DP4_INC(px, 2, x0, x1, x2, x3);
MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
x0, x1, x2, x3);
ST_DP4_INC(x0, x1, x2, x3, x, 2);
}

if (n & 2)
{
LD_DP2_INC(px, 2, x0, x1);
MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
ST_DP2_INC(x0, x1, x, 2);
}

if (n & 1)
{
LD_GP2_INC(px, 1, f0, f1);
MUL2(f0, da_r, f1, da_r, f0, f1);
ST_GP2_INC(f0, f1, x, 1);
}
}
}
else
{
FLOAT *x_pref;
BLASLONG pref_offset;

pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 32;

da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
da_i_vec_neg = -da_i_vec;
da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);

da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);

for (i = (n >> 4); i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(x_pref, 128);
PREF_OFFSET(x_pref, 160);
PREF_OFFSET(x_pref, 192);
PREF_OFFSET(x_pref, 224);
x_pref += 32;

LD_DP16_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
x11, x12, x13, x14, x15);
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
d0, d1, d2, d3);
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
d4, d5, d6, d7);
MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
d8, d9, d10, d11);
MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
d12, d13, d14, d15);
SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
d12, d13, d14, d15, x, 2);
}

if (n & 15)
{
if (n & 8)
{
LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
d0, d1, d2, d3);
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
d4, d5, d6, d7);
SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 2);
}

if (n & 4)
{
LD_DP4_INC(px, 2, x0, x1, x2, x3);
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
d0, d1, d2, d3);
SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
ST_DP4_INC(d0, d1, d2, d3, x, 2);
}

if (n & 2)
{
LD_DP2_INC(px, 2, x0, x1);
MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
SHF_W2_DP(d0, d1, d0, d1, SHF_78);
FMADD2(x0, x1, da_r_vec, d0, d1);
ST_DP2_INC(d0, d1, x, 2);
}

if (n & 1)
{
LD_GP2_INC(px, 1, f0, f1);

tp0 = da_r * f0;
tp0 -= da_i * f1;
tp1 = da_r * f1;
tp1 += da_i * f0;

ST_GP2_INC(tp0, tp1, x, 1);
}
}
}
}
else
{
inc_x2 = 2 * inc_x;

if ((0.0 == da_r) && (0.0 == da_i))
{
v2f64 zero_v = __msa_cast_to_vector_double(0);
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);

for (i = (n >> 4); i--;)
{
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
zero_v, zero_v, x, inc_x2);
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
zero_v, zero_v, x, inc_x2);
}

if (n & 15)
{
if (n & 8)
{
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
zero_v, zero_v, x, inc_x2);
}

if (n & 4)
{
ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, inc_x2);
}

if (n & 2)
{
ST_DP2_INC(zero_v, zero_v, x, inc_x2);
}

if (n & 1)
{
ST_DP(zero_v, x);
}
}
}
else if (0.0 == da_r)
{
da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
da_i_vec_neg = -da_i_vec;
da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);

for (i = (n >> 4); i--;)
{
LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
x10, x11, x12, x13, x14, x15);
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
x0, x1, x2, x3);
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
x4, x5, x6, x7);
MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
x8, x9, x10, x11);
MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
x12, x13, x14, x15);
SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
x12, x13, x14, x15, x, inc_x2);
}

if (n & 15)
{
if (n & 8)
{
LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
x0, x1, x2, x3);
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
x4, x5, x6, x7);
SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
}

if (n & 4)
{
LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
x0, x1, x2, x3);
SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
}

if (n & 2)
{
LD_DP2_INC(px, inc_x2, x0, x1);
MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
SHF_W2_DP(x0, x1, x0, x1, SHF_78);
ST_DP2_INC(x0, x1, x, inc_x2);
}

if (n & 1)
{
LD_GP2_INC(px, 1, f0, f1);
MUL2(f0, da_i, f1, -da_i, f0, f1);
ST_GP2_INC(f1, f0, x, 1);
}
}
}
else if (0.0 == da_i)
{
da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);

for (i = (n >> 4); i--;)
{
LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
x10, x11, x12, x13, x14, x15);
MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
x0, x1, x2, x3);
MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
x4, x5, x6, x7);
MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
x8, x9, x10, x11);
MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
x12, x13, x14, x15);
ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
x12, x13, x14, x15, x, inc_x2);
}

if (n & 15)
{
if (n & 8)
{
LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
x0, x1, x2, x3);
MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
x4, x5, x6, x7);
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
}

if (n & 4)
{
LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
x0, x1, x2, x3);
ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
}

if (n & 2)
{
LD_DP2_INC(px, inc_x2, x0, x1);
MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
ST_DP2_INC(x0, x1, x, inc_x2);
}

if (n & 1)
{
LD_GP2_INC(px, 1, f0, f1);
MUL2(f0, da_r, f1, da_r, f0, f1);
ST_GP2_INC(f0, f1, x, 1);
}
}
}
else
{
da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
da_i_vec_neg = -da_i_vec;
da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);

da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);

for (i = (n >> 4); i--;)
{
LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
x10, x11, x12, x13, x14, x15);
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
d0, d1, d2, d3);
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
d4, d5, d6, d7);
MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
d8, d9, d10, d11);
MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
d12, d13, d14, d15);
SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
d12, d13, d14, d15, x, inc_x2);
}

if (n & 15)
{
if (n & 8)
{
LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
d0, d1, d2, d3);
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
d4, d5, d6, d7);
SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, inc_x2);
}

if (n & 4)
{
LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
d0, d1, d2, d3);
SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
ST_DP4_INC(d0, d1, d2, d3, x, inc_x2);
}

if (n & 2)
{
LD_DP2_INC(px, inc_x2, x0, x1);
MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
SHF_W2_DP(d0, d1, d0, d1, SHF_78);
FMADD2(x0, x1, da_r_vec, d0, d1);
ST_DP2_INC(d0, d1, x, inc_x2);
}

if (n & 1)
{
LD_GP2_INC(px, 1, f0, f1);

tp0 = da_r * f0;
tp0 -= da_i * f1;
tp1 = da_r * f1;
tp1 += da_i * f0;

ST_GP2_INC(tp0, tp1, x, 1);
}
}
}
}

return (0);
}

+ 238
- 0
kernel/mips/zswap_msa.c View File

@@ -0,0 +1,238 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy,
BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i, inc_x2, inc_y2, pref_offsetx, pref_offsety;
FLOAT *px, *py;
v2f64 x0, x1, x2, x3, x4, x5, x6, x7;
v2f64 y0, y1, y2, y3, y4, y5, y6, y7;

if (n < 0) return (0);

pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
if (pref_offsetx > 0)
{
pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
pref_offsetx = pref_offsetx / sizeof(FLOAT);
}

pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
if (pref_offsety > 0)
{
pref_offsety = L1_DATA_LINESIZE - pref_offsety;
pref_offsety = pref_offsety / sizeof(FLOAT);
}

inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;

px = srcx;
py = srcy;

if ((1 == inc_x) && (1 == inc_y))
{
if (n >> 3)
{
LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);

for (i = (n >> 3) - 1; i--;)
{
PREFETCH(px + pref_offsetx + 16);
PREFETCH(px + pref_offsetx + 20);
PREFETCH(px + pref_offsetx + 24);
PREFETCH(px + pref_offsetx + 28);

PREFETCH(py + pref_offsety + 16);
PREFETCH(py + pref_offsety + 20);
PREFETCH(py + pref_offsety + 24);
PREFETCH(py + pref_offsety + 28);

y0 = LD_DP(py); py += 2;
ST_DP(x0, srcy); srcy += 2;
y1 = LD_DP(py); py += 2;
ST_DP(x1, srcy); srcy += 2;
y2 = LD_DP(py); py += 2;
ST_DP(x2, srcy); srcy += 2;
y3 = LD_DP(py); py += 2;
ST_DP(x3, srcy); srcy += 2;
y4 = LD_DP(py); py += 2;
ST_DP(x4, srcy); srcy += 2;
y5 = LD_DP(py); py += 2;
ST_DP(x5, srcy); srcy += 2;
y6 = LD_DP(py); py += 2;
ST_DP(x6, srcy); srcy += 2;
y7 = LD_DP(py); py += 2;
ST_DP(x7, srcy); srcy += 2;

x0 = LD_DP(px); px += 2;
ST_DP(y0, srcx); srcx += 2;
x1 = LD_DP(px); px += 2;
ST_DP(y1, srcx); srcx += 2;
x2 = LD_DP(px); px += 2;
ST_DP(y2, srcx); srcx += 2;
x3 = LD_DP(px); px += 2;
ST_DP(y3, srcx); srcx += 2;
x4 = LD_DP(px); px += 2;
ST_DP(y4, srcx); srcx += 2;
x5 = LD_DP(px); px += 2;
ST_DP(y5, srcx); srcx += 2;
x6 = LD_DP(px); px += 2;
ST_DP(y6, srcx); srcx += 2;
x7 = LD_DP(px); px += 2;
ST_DP(y7, srcx); srcx += 2;
}

LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, 2);
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, 2);
}

if (n & 7)
{
if ((n & 4) && (n & 2) && (n & 1))
{
LD_DP7_INC(px, 2, x0, x1, x2, x3, x4, x5, x6);
LD_DP7_INC(py, 2, y0, y1, y2, y3, y4, y5, y6);
ST_DP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, 2);
ST_DP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, 2);
}
else if ((n & 4) && (n & 2))
{
LD_DP6_INC(px, 2, x0, x1, x2, x3, x4, x5);
LD_DP6_INC(py, 2, y0, y1, y2, y3, y4, y5);
ST_DP6_INC(x0, x1, x2, x3, x4, x5, srcy, 2);
ST_DP6_INC(y0, y1, y2, y3, y4, y5, srcx, 2);
}
else if ((n & 4) && (n & 1))
{
LD_DP5_INC(px, 2, x0, x1, x2, x3, x4);
LD_DP5_INC(py, 2, y0, y1, y2, y3, y4);
ST_DP5_INC(x0, x1, x2, x3, x4, srcy, 2);
ST_DP5_INC(y0, y1, y2, y3, y4, srcx, 2);
}
else if ((n & 2) && (n & 1))
{
LD_DP3_INC(px, 2, x0, x1, x2);
LD_DP3_INC(py, 2, y0, y1, y2);
ST_DP3_INC(x0, x1, x2, srcy, 2);
ST_DP3_INC(y0, y1, y2, srcx, 2);
}
else if (n & 4)
{
LD_DP4_INC(px, 2, x0, x1, x2, x3);
LD_DP4_INC(py, 2, y0, y1, y2, y3);
ST_DP4_INC(x0, x1, x2, x3, srcy, 2);
ST_DP4_INC(y0, y1, y2, y3, srcx, 2);
}
else if (n & 2)
{
LD_DP2_INC(px, 2, x0, x1);
LD_DP2_INC(py, 2, y0, y1);
ST_DP2_INC(x0, x1, srcy, 2);
ST_DP2_INC(y0, y1, srcx, 2);
}
else if (n & 1)
{
x0 = LD_DP(px);
y0 = LD_DP(py);
ST_DP(y0, srcx);
ST_DP(x0, srcy);
}
}
}
else
{
for (i = (n >> 3); i--;)
{
LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7);
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y2);
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x2);
}

if (n & 7)
{
if ((n & 4) && (n & 2) && (n & 1))
{
LD_DP7_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6);
LD_DP7_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6);
ST_DP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y2);
ST_DP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x2);
}
else if ((n & 4) && (n & 2))
{
LD_DP6_INC(px, inc_x2, x0, x1, x2, x3, x4, x5);
LD_DP6_INC(py, inc_y2, y0, y1, y2, y3, y4, y5);
ST_DP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y2);
ST_DP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x2);
}
else if ((n & 4) && (n & 1))
{
LD_DP5_INC(px, inc_x2, x0, x1, x2, x3, x4);
LD_DP5_INC(py, inc_y2, y0, y1, y2, y3, y4);
ST_DP5_INC(x0, x1, x2, x3, x4, srcy, inc_y2);
ST_DP5_INC(y0, y1, y2, y3, y4, srcx, inc_x2);
}
else if ((n & 2) && (n & 1))
{
LD_DP3_INC(px, inc_x2, x0, x1, x2);
LD_DP3_INC(py, inc_y2, y0, y1, y2);
ST_DP3_INC(x0, x1, x2, srcy, inc_y2);
ST_DP3_INC(y0, y1, y2, srcx, inc_x2);
}
else if (n & 4)
{
LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);
ST_DP4_INC(x0, x1, x2, x3, srcy, inc_y2);
ST_DP4_INC(y0, y1, y2, y3, srcx, inc_x2);
}
else if (n & 2)
{
LD_DP2_INC(px, inc_x2, x0, x1);
LD_DP2_INC(py, inc_y2, y0, y1);
ST_DP2_INC(x0, x1, srcy, inc_y2);
ST_DP2_INC(y0, y1, srcx, inc_x2);
}
else if (n & 1)
{
x0 = LD_DP(px);
y0 = LD_DP(py);
ST_DP(y0, srcx);
ST_DP(x0, srcy);
}
}
}

return (0);
}

Loading…
Cancel
Save