Browse Source

[ZARCH] Z14 support, BLAS 1/2 single precision implementations, Some missing double precision implementations, Gemv optimization

tags/v0.3.6^2
maamountki GitHub 7 years ago
parent
commit
23229011db
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
79 changed files with 17382 additions and 2965 deletions
  1. +4
    -0
      Makefile.zarch
  2. +12
    -23
      cpuid_zarch.c
  3. +10
    -10
      kernel/zarch/KERNEL.Z13
  4. +146
    -0
      kernel/zarch/KERNEL.Z14
  5. +269
    -0
      kernel/zarch/camax.c
  6. +269
    -0
      kernel/zarch/camin.c
  7. +167
    -0
      kernel/zarch/casum.c
  8. +174
    -0
      kernel/zarch/caxpy.c
  9. +99
    -0
      kernel/zarch/ccopy.c
  10. +182
    -0
      kernel/zarch/cdot.c
  11. +256
    -0
      kernel/zarch/crot.c
  12. +456
    -0
      kernel/zarch/cscal.c
  13. +183
    -0
      kernel/zarch/cswap.c
  14. +206
    -0
      kernel/zarch/damax.c
  15. +206
    -0
      kernel/zarch/damin.c
  16. +83
    -75
      kernel/zarch/dasum.c
  17. +89
    -88
      kernel/zarch/daxpy.c
  18. +19
    -103
      kernel/zarch/dcopy.c
  19. +48
    -107
      kernel/zarch/ddot.c
  20. +361
    -155
      kernel/zarch/dgemv_n_4.c
  21. +430
    -148
      kernel/zarch/dgemv_t_4.c
  22. +182
    -0
      kernel/zarch/dmax.c
  23. +182
    -0
      kernel/zarch/dmin.c
  24. +166
    -172
      kernel/zarch/drot.c
  25. +71
    -129
      kernel/zarch/dscal.c
  26. +180
    -0
      kernel/zarch/dsdot.c
  27. +83
    -209
      kernel/zarch/dswap.c
  28. +319
    -0
      kernel/zarch/icamax.c
  29. +319
    -0
      kernel/zarch/icamin.c
  30. +152
    -143
      kernel/zarch/idamax.c
  31. +159
    -166
      kernel/zarch/idamin.c
  32. +232
    -0
      kernel/zarch/idmax.c
  33. +232
    -0
      kernel/zarch/idmin.c
  34. +299
    -0
      kernel/zarch/isamax.c
  35. +299
    -0
      kernel/zarch/isamin.c
  36. +275
    -0
      kernel/zarch/ismax.c
  37. +275
    -0
      kernel/zarch/ismin.c
  38. +154
    -180
      kernel/zarch/izamax.c
  39. +182
    -218
      kernel/zarch/izamin.c
  40. +210
    -0
      kernel/zarch/samax.c
  41. +210
    -0
      kernel/zarch/samin.c
  42. +174
    -0
      kernel/zarch/sasum.c
  43. +184
    -0
      kernel/zarch/saxpy.c
  44. +85
    -0
      kernel/zarch/scopy.c
  45. +140
    -0
      kernel/zarch/sdot.c
  46. +668
    -0
      kernel/zarch/sgemv_n_4.c
  47. +826
    -0
      kernel/zarch/sgemv_t_4.c
  48. +186
    -0
      kernel/zarch/smax.c
  49. +186
    -0
      kernel/zarch/smin.c
  50. +246
    -0
      kernel/zarch/srot.c
  51. +201
    -0
      kernel/zarch/sscal.c
  52. +164
    -0
      kernel/zarch/sswap.c
  53. +221
    -0
      kernel/zarch/zamax.c
  54. +221
    -0
      kernel/zarch/zamin.c
  55. +79
    -73
      kernel/zarch/zasum.c
  56. +87
    -129
      kernel/zarch/zaxpy.c
  57. +20
    -66
      kernel/zarch/zcopy.c
  58. +83
    -130
      kernel/zarch/zdot.c
  59. +167
    -172
      kernel/zarch/zrot.c
  60. +200
    -260
      kernel/zarch/zscal.c
  61. +82
    -209
      kernel/zarch/zswap.c
  62. +437
    -0
      ztest/Makefile
  63. +235
    -0
      ztest/amax.c
  64. +235
    -0
      ztest/amin.c
  65. +263
    -0
      ztest/asum.c
  66. +303
    -0
      ztest/axpy.c
  67. +291
    -0
      ztest/copy.c
  68. +296
    -0
      ztest/dot.c
  69. +229
    -0
      ztest/dsdot.c
  70. +618
    -0
      ztest/gemv.c
  71. +284
    -0
      ztest/iamax.c
  72. +284
    -0
      ztest/iamin.c
  73. +231
    -0
      ztest/imax.c
  74. +231
    -0
      ztest/imin.c
  75. +229
    -0
      ztest/max.c
  76. +229
    -0
      ztest/min.c
  77. +303
    -0
      ztest/rot.c
  78. +308
    -0
      ztest/scal.c
  79. +306
    -0
      ztest/swap.c

+ 4
- 0
Makefile.zarch View File

@@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector
FCOMMON_OPT += -march=z13 -mzvector
endif

ifeq ($(CORE), Z14)
CCOMMON_OPT += -march=z14 -mzvector
FCOMMON_OPT += -march=z14 -mzvector
endif

+ 12
- 23
cpuid_zarch.c View File

@@ -29,40 +29,25 @@

#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2

static char *cpuname[] = {
"ZARCH_GENERIC",
"Z13"
"Z13",
"Z14"
};

static char *cpuname_lower[] = {
"zarch_generic",
"z13"
"z13",
"z14"
};

int detect(void)
{
FILE *infile;
char buffer[512], *p;

p = (char *)NULL;
infile = fopen("/proc/sysinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("Type", buffer, 4)){
p = strchr(buffer, ':') + 2;
#if 0
fprintf(stderr, "%s\n", p);
#endif
break;
}
}

fclose(infile);

if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13;

return CPU_GENERIC;
// return CPU_GENERIC;
return CPU_Z14;
}

void get_libname(void)
@@ -107,5 +92,9 @@ void get_cpuconfig(void)
printf("#define Z13\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
break;
case CPU_Z14:
printf("#define Z14\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
break;
}
}

+ 10
- 10
kernel/zarch/KERNEL.Z13 View File

@@ -1,18 +1,18 @@
SAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = damax.c
CAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = zamax.c

SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
DAMINKERNEL = damin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = zamin.c

SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
DMAXKERNEL = dmax.c

SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
DMINKERNEL = dmin.c

ISAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = idamax.c
@@ -25,10 +25,10 @@ ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = izamin.c

ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = idmax.c

ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
IDMINKERNEL = idmin.c

SASUMKERNEL = ../arm/asum.c
DASUMKERNEL = dasum.c
@@ -74,12 +74,12 @@ ZSWAPKERNEL = zswap.c
SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = dgemv_n_4.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = zgemv_n_4.c
ZGEMVNKERNEL = ../arm/zgemv_n.c

SGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = dgemv_t_4.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = zgemv_t_4.c
ZGEMVTKERNEL = ../arm/zgemv_t.c

STRMMKERNEL = strmm8x4V.S
DTRMMKERNEL = trmm8x4V.S


+ 146
- 0
kernel/zarch/KERNEL.Z14 View File

@@ -0,0 +1,146 @@
# Kernel source selection for the Z14 core.
# Each <ROUTINE>KERNEL variable names the source file used for that routine;
# bare names live in this directory, ../arm and ../generic entries point at
# the sources kept in those sibling directories.

# amax / amin
SAMAXKERNEL = samax.c
DAMAXKERNEL = damax.c
CAMAXKERNEL = camax.c
ZAMAXKERNEL = zamax.c

SAMINKERNEL = samin.c
DAMINKERNEL = damin.c
CAMINKERNEL = camin.c
ZAMINKERNEL = zamin.c

# max / min
SMAXKERNEL = smax.c
DMAXKERNEL = dmax.c

SMINKERNEL = smin.c
DMINKERNEL = dmin.c

# iamax / iamin
ISAMAXKERNEL = isamax.c
IDAMAXKERNEL = idamax.c
ICAMAXKERNEL = icamax.c
IZAMAXKERNEL = izamax.c

ISAMINKERNEL = isamin.c
IDAMINKERNEL = idamin.c
ICAMINKERNEL = icamin.c
IZAMINKERNEL = izamin.c

# imax / imin
ISMAXKERNEL = ismax.c
IDMAXKERNEL = idmax.c

ISMINKERNEL = ismin.c
IDMINKERNEL = idmin.c

# asum
SASUMKERNEL = sasum.c
DASUMKERNEL = dasum.c
CASUMKERNEL = casum.c
ZASUMKERNEL = zasum.c

# axpy
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c

# copy
SCOPYKERNEL = scopy.c
DCOPYKERNEL = dcopy.c
CCOPYKERNEL = ccopy.c
ZCOPYKERNEL = zcopy.c

# dot
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
CDOTKERNEL = cdot.c
ZDOTKERNEL = zdot.c
DSDOTKERNEL = dsdot.c

# nrm2: no local implementation here, sources come from ../arm
SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c
CNRM2KERNEL = ../arm/znrm2.c
ZNRM2KERNEL = ../arm/znrm2.c

# rot
SROTKERNEL = srot.c
DROTKERNEL = drot.c
CROTKERNEL = crot.c
ZROTKERNEL = zrot.c

# scal
SSCALKERNEL = sscal.c
DSCALKERNEL = dscal.c
CSCALKERNEL = cscal.c
ZSCALKERNEL = zscal.c

# swap
SSWAPKERNEL = sswap.c
DSWAPKERNEL = dswap.c
CSWAPKERNEL = cswap.c
ZSWAPKERNEL = zswap.c

# gemv (n): S and D use local sources, C and Z use the ../arm sources
SGEMVNKERNEL = sgemv_n_4.c
DGEMVNKERNEL = dgemv_n_4.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c

# gemv (t): S and D use local sources, C and Z use the ../arm sources
SGEMVTKERNEL = sgemv_t_4.c
DGEMVTKERNEL = dgemv_t_4.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c

# trmm
STRMMKERNEL = strmm8x4V.S
DTRMMKERNEL = trmm8x4V.S
CTRMMKERNEL = ctrmm4x4V.S
ZTRMMKERNEL = ztrmm4x4V.S

# sgemm: kernel plus the generic copy routines and their object names
SGEMMKERNEL = strmm8x4V.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o


# dgemm
DGEMMKERNEL = gemm8x4V.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o

# cgemm
CGEMMKERNEL = ctrmm4x4V.S
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o

# zgemm
# NOTE(review): only the Z object names use $(TSUFFIX).$(SUFFIX); the S/D/C
# entries above hard-code ".o" -- confirm whether that difference is intended.
ZGEMMKERNEL = ztrmm4x4V.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

# trsm: all four precisions use the ../generic C sources
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c






+ 269
- 0
kernel/zarch/camax.c View File

@@ -0,0 +1,269 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

/* Scalar absolute value: fabs when DOUBLE is defined, fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * |x[i]| + |x[i + 1]|: the 1-norm of the complex entry whose real part sits
 * at index i of the interleaved array x.  Both arguments are parenthesised so
 * that expression arguments (e.g. "k << 1") index the intended elements.
 * Each argument is still evaluated twice; do not pass expressions with side
 * effects.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))

/*
 * Vector kernel: returns the largest |re| + |im| over the first n complex
 * values of x, where x stores interleaved 4-byte (re, im) pairs.
 *
 * The loop counter is n >> 5 and each pass consumes 256 bytes (32 complex
 * values), so n must be a positive multiple of 32; a count of zero would make
 * the brctg loop wrap around.
 *
 * Register use: v0 holds four running lane maxima, r0 is the loop counter,
 * r1 is the byte offset into x.
 */
static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT amax;

    __asm__ volatile (
        /* Seed v0 with |re| + |im| of the first four complex values:
           real parts go to v0 lanes 0..3, imaginary parts to v16 lanes 0..3. */
        "vlef %%v0,0(%2),0 \n\t"
        "vlef %%v16,4(%2),0 \n\t"
        "vlef %%v0,8(%2),1 \n\t"
        "vlef %%v16,12(%2),1 \n\t"
        "vlef %%v0,16(%2),2 \n\t"
        "vlef %%v16,20(%2),2 \n\t"
        "vlef %%v0,24(%2),3 \n\t"
        "vlef %%v16,28(%2),3 \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vflpsb %%v16,%%v16 \n\t"
        "vfasb %%v0,%%v0,%%v16 \n\t"
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        /* Prefetch ahead of the current offset (index register included so
           the hint advances with the loop). */
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* First 16 complex values of the block: even registers collect real
           parts, odd registers collect imaginary parts. */
        "vlef %%v16,0(%%r1,%2),0 \n\t"
        "vlef %%v17,4(%%r1,%2),0 \n\t"
        "vlef %%v16,8(%%r1,%2),1 \n\t"
        "vlef %%v17,12(%%r1,%2),1 \n\t"
        "vlef %%v16,16(%%r1,%2),2 \n\t"
        "vlef %%v17,20(%%r1,%2),2 \n\t"
        "vlef %%v16,24(%%r1,%2),3 \n\t"
        "vlef %%v17,28(%%r1,%2),3 \n\t"

        "vlef %%v18,32(%%r1,%2),0 \n\t"
        "vlef %%v19,36(%%r1,%2),0 \n\t"
        "vlef %%v18,40(%%r1,%2),1 \n\t"
        "vlef %%v19,44(%%r1,%2),1 \n\t"
        "vlef %%v18,48(%%r1,%2),2 \n\t"
        "vlef %%v19,52(%%r1,%2),2 \n\t"
        "vlef %%v18,56(%%r1,%2),3 \n\t"
        /* Imaginary part of the 8th value lives at byte 60 (was 30). */
        "vlef %%v19,60(%%r1,%2),3 \n\t"

        "vlef %%v20,64(%%r1,%2),0 \n\t"
        "vlef %%v21,68(%%r1,%2),0 \n\t"
        "vlef %%v20,72(%%r1,%2),1 \n\t"
        "vlef %%v21,76(%%r1,%2),1 \n\t"
        "vlef %%v20,80(%%r1,%2),2 \n\t"
        "vlef %%v21,84(%%r1,%2),2 \n\t"
        "vlef %%v20,88(%%r1,%2),3 \n\t"
        "vlef %%v21,92(%%r1,%2),3 \n\t"

        "vlef %%v22,96(%%r1,%2),0 \n\t"
        "vlef %%v23,100(%%r1,%2),0 \n\t"
        "vlef %%v22,104(%%r1,%2),1 \n\t"
        "vlef %%v23,108(%%r1,%2),1 \n\t"
        "vlef %%v22,112(%%r1,%2),2 \n\t"
        "vlef %%v23,116(%%r1,%2),2 \n\t"
        "vlef %%v22,120(%%r1,%2),3 \n\t"
        "vlef %%v23,124(%%r1,%2),3 \n\t"

        /* |re| and |im|, then |re| + |im| into v16..v19. */
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        /* Pairwise lane-wise maximum: compare-high mask, then select. */
        "vfchsb %%v24,%%v16,%%v17 \n\t"
        "vfchsb %%v25,%%v18,%%v19 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"

        "vfchsb %%v26,%%v24,%%v25 \n\t"
        "vsel %%v26,%%v24,%%v25,%%v26 \n\t"

        /* Fold into the running maxima. */
        "vfchsb %%v27,%%v26,%%v0 \n\t"
        "vsel %%v0,%%v26,%%v0,%%v27 \n\t"

        /* Second 16 complex values of the block. */
        "vlef %%v16,128(%%r1,%2),0 \n\t"
        "vlef %%v17,132(%%r1,%2),0 \n\t"
        "vlef %%v16,136(%%r1,%2),1 \n\t"
        "vlef %%v17,140(%%r1,%2),1 \n\t"
        "vlef %%v16,144(%%r1,%2),2 \n\t"
        "vlef %%v17,148(%%r1,%2),2 \n\t"
        "vlef %%v16,152(%%r1,%2),3 \n\t"
        "vlef %%v17,156(%%r1,%2),3 \n\t"

        "vlef %%v18,160(%%r1,%2),0 \n\t"
        "vlef %%v19,164(%%r1,%2),0 \n\t"
        "vlef %%v18,168(%%r1,%2),1 \n\t"
        "vlef %%v19,172(%%r1,%2),1 \n\t"
        "vlef %%v18,176(%%r1,%2),2 \n\t"
        "vlef %%v19,180(%%r1,%2),2 \n\t"
        "vlef %%v18,184(%%r1,%2),3 \n\t"
        "vlef %%v19,188(%%r1,%2),3 \n\t"

        "vlef %%v20,192(%%r1,%2),0 \n\t"
        "vlef %%v21,196(%%r1,%2),0 \n\t"
        "vlef %%v20,200(%%r1,%2),1 \n\t"
        "vlef %%v21,204(%%r1,%2),1 \n\t"
        "vlef %%v20,208(%%r1,%2),2 \n\t"
        "vlef %%v21,212(%%r1,%2),2 \n\t"
        "vlef %%v20,216(%%r1,%2),3 \n\t"
        "vlef %%v21,220(%%r1,%2),3 \n\t"

        "vlef %%v22,224(%%r1,%2),0 \n\t"
        "vlef %%v23,228(%%r1,%2),0 \n\t"
        "vlef %%v22,232(%%r1,%2),1 \n\t"
        "vlef %%v23,236(%%r1,%2),1 \n\t"
        "vlef %%v22,240(%%r1,%2),2 \n\t"
        "vlef %%v23,244(%%r1,%2),2 \n\t"
        "vlef %%v22,248(%%r1,%2),3 \n\t"
        "vlef %%v23,252(%%r1,%2),3 \n\t"

        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        "vfchsb %%v24,%%v16,%%v17 \n\t"
        "vfchsb %%v25,%%v18,%%v19 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"

        "vfchsb %%v26,%%v24,%%v25 \n\t"
        "vsel %%v26,%%v24,%%v25,%%v26 \n\t"

        "vfchsb %%v27,%%v26,%%v0 \n\t"
        "vsel %%v0,%%v26,%%v0,%%v27 \n\t"

        /* Advance by one 256-byte block and loop while r0 != 0. */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Horizontal reduction of the four lanes of v0 into lane 0. */
        "veslg %%v16,%%v0,32 \n\t"
        "vfchsb %%v17,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"

        "vrepf %%v16,%%v0,2 \n\t"
        "wfchsb %%v17,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
        "ler %0,%%f0 "
        :"=f"(amax)
        /* n complex values occupy 2 * n FLOATs. */
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
    );

    return amax;
}
/*
 * Returns the maximum of |re| + |im| over n complex values of x.
 *
 *   n      number of complex elements
 *   x      interleaved (re, im) data
 *   inc_x  stride between consecutive elements, counted in complex elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  With unit stride the leading
 * multiple-of-32 part is handled by camax_kernel_32 and the remainder by a
 * scalar loop; other strides use a scalar loop unrolled by four.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG ix = 0;
    FLOAT maxf = 0.0;

    if (n <= 0 || inc_x <= 0) return (maxf);

    if (inc_x == 1) {

        BLASLONG n1 = n & -32;
        if (n1 > 0) {
            maxf = camax_kernel_32(n1, x);
            i = n1;
        } else {
            /* Fewer than 32 elements: seed with the first one. */
            maxf = CABS1(x, 0);
            i++;
        }

        /* Scalar tail.  Compare the full |re| + |im| of each element; the
           previous code looked at |re| only and so missed maxima carried by
           the imaginary part. */
        ix = 2 * i;
        while (i < n) {
            if (CABS1(x, ix) > maxf) {
                maxf = CABS1(x, ix);
            }
            i++;
            ix += 2;
        }
        return (maxf);

    } else {

        BLASLONG inc_x2 = 2 * inc_x;     /* stride in FLOATs */
        BLASLONG j = 1;                  /* element 0 seeds the maximum */
        BLASLONG n1 = (n - 1) & -4;

        maxf = CABS1(x, 0);
        ix = inc_x2;

        /* Four elements per pass. */
        while (j < n1) {

            if (CABS1(x, ix) > maxf) {
                maxf = CABS1(x, ix);
            }
            if (CABS1(x, ix + inc_x2) > maxf) {
                maxf = CABS1(x, ix + inc_x2);
            }
            if (CABS1(x, ix + inc_x2 * 2) > maxf) {
                maxf = CABS1(x, ix + inc_x2 * 2);
            }
            if (CABS1(x, ix + inc_x2 * 3) > maxf) {
                maxf = CABS1(x, ix + inc_x2 * 3);
            }

            ix += inc_x2 * 4;
            j += 4;
        }

        /* Remaining elements. */
        while (j < n) {
            if (CABS1(x, ix) > maxf) {
                maxf = CABS1(x, ix);
            }
            ix += inc_x2;
            j++;
        }
        return (maxf);
    }
}

+ 269
- 0
kernel/zarch/camin.c View File

@@ -0,0 +1,269 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

/* Scalar absolute value: fabs when DOUBLE is defined, fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * |x[i]| + |x[i + 1]|: the 1-norm of the complex entry whose real part sits
 * at index i of the interleaved array x.  Both arguments are parenthesised so
 * that expression arguments (e.g. "k << 1") index the intended elements.
 * Each argument is still evaluated twice; do not pass expressions with side
 * effects.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))

/*
 * Vector kernel: returns the smallest |re| + |im| over the first n complex
 * values of x, where x stores interleaved 4-byte (re, im) pairs.
 *
 * The loop counter is n >> 5 and each pass consumes 256 bytes (32 complex
 * values), so n must be a positive multiple of 32; a count of zero would make
 * the brctg loop wrap around.
 *
 * Register use: v0 holds four running lane minima, r0 is the loop counter,
 * r1 is the byte offset into x.
 *
 * Every group of four complex values is gathered into lanes 0, 1, 2, 3.  The
 * earlier code wrote the second value of each group into lane 0 again, which
 * dropped one value and left lane 1 uninitialised.
 */
static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
{
    FLOAT amin;

    __asm__ volatile (
        /* Seed v0 with |re| + |im| of the first four complex values. */
        "vlef %%v0,0(%2),0 \n\t"
        "vlef %%v16,4(%2),0 \n\t"
        "vlef %%v0,8(%2),1 \n\t"
        "vlef %%v16,12(%2),1 \n\t"
        "vlef %%v0,16(%2),2 \n\t"
        "vlef %%v16,20(%2),2 \n\t"
        "vlef %%v0,24(%2),3 \n\t"
        "vlef %%v16,28(%2),3 \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vflpsb %%v16,%%v16 \n\t"
        "vfasb %%v0,%%v0,%%v16 \n\t"
        "srlg %%r0,%1,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%2) \n\t"

        /* First 16 complex values of the block: even registers collect real
           parts, odd registers collect imaginary parts. */
        "vlef %%v16,0(%%r1,%2),0 \n\t"
        "vlef %%v17,4(%%r1,%2),0 \n\t"
        "vlef %%v16,8(%%r1,%2),1 \n\t"
        "vlef %%v17,12(%%r1,%2),1 \n\t"
        "vlef %%v16,16(%%r1,%2),2 \n\t"
        "vlef %%v17,20(%%r1,%2),2 \n\t"
        "vlef %%v16,24(%%r1,%2),3 \n\t"
        "vlef %%v17,28(%%r1,%2),3 \n\t"

        "vlef %%v18,32(%%r1,%2),0 \n\t"
        "vlef %%v19,36(%%r1,%2),0 \n\t"
        "vlef %%v18,40(%%r1,%2),1 \n\t"
        "vlef %%v19,44(%%r1,%2),1 \n\t"
        "vlef %%v18,48(%%r1,%2),2 \n\t"
        "vlef %%v19,52(%%r1,%2),2 \n\t"
        "vlef %%v18,56(%%r1,%2),3 \n\t"
        /* Imaginary part of the 8th value lives at byte 60 (was 30). */
        "vlef %%v19,60(%%r1,%2),3 \n\t"

        "vlef %%v20,64(%%r1,%2),0 \n\t"
        "vlef %%v21,68(%%r1,%2),0 \n\t"
        "vlef %%v20,72(%%r1,%2),1 \n\t"
        "vlef %%v21,76(%%r1,%2),1 \n\t"
        "vlef %%v20,80(%%r1,%2),2 \n\t"
        "vlef %%v21,84(%%r1,%2),2 \n\t"
        "vlef %%v20,88(%%r1,%2),3 \n\t"
        "vlef %%v21,92(%%r1,%2),3 \n\t"

        "vlef %%v22,96(%%r1,%2),0 \n\t"
        "vlef %%v23,100(%%r1,%2),0 \n\t"
        "vlef %%v22,104(%%r1,%2),1 \n\t"
        "vlef %%v23,108(%%r1,%2),1 \n\t"
        "vlef %%v22,112(%%r1,%2),2 \n\t"
        "vlef %%v23,116(%%r1,%2),2 \n\t"
        "vlef %%v22,120(%%r1,%2),3 \n\t"
        "vlef %%v23,124(%%r1,%2),3 \n\t"

        /* |re| and |im|, then |re| + |im| into v16..v19. */
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        /* Pairwise lane-wise minimum: the mask is set where the second
           operand is larger, so vsel keeps the smaller value. */
        "vfchsb %%v24,%%v17,%%v16 \n\t"
        "vfchsb %%v25,%%v19,%%v18 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"

        "vfchsb %%v26,%%v25,%%v24 \n\t"
        "vsel %%v26,%%v24,%%v25,%%v26 \n\t"

        /* Fold into the running minima. */
        "vfchsb %%v27,%%v0,%%v26 \n\t"
        "vsel %%v0,%%v26,%%v0,%%v27 \n\t"

        /* Second 16 complex values of the block. */
        "vlef %%v16,128(%%r1,%2),0 \n\t"
        "vlef %%v17,132(%%r1,%2),0 \n\t"
        "vlef %%v16,136(%%r1,%2),1 \n\t"
        "vlef %%v17,140(%%r1,%2),1 \n\t"
        "vlef %%v16,144(%%r1,%2),2 \n\t"
        "vlef %%v17,148(%%r1,%2),2 \n\t"
        "vlef %%v16,152(%%r1,%2),3 \n\t"
        "vlef %%v17,156(%%r1,%2),3 \n\t"

        "vlef %%v18,160(%%r1,%2),0 \n\t"
        "vlef %%v19,164(%%r1,%2),0 \n\t"
        "vlef %%v18,168(%%r1,%2),1 \n\t"
        "vlef %%v19,172(%%r1,%2),1 \n\t"
        "vlef %%v18,176(%%r1,%2),2 \n\t"
        "vlef %%v19,180(%%r1,%2),2 \n\t"
        "vlef %%v18,184(%%r1,%2),3 \n\t"
        "vlef %%v19,188(%%r1,%2),3 \n\t"

        "vlef %%v20,192(%%r1,%2),0 \n\t"
        "vlef %%v21,196(%%r1,%2),0 \n\t"
        "vlef %%v20,200(%%r1,%2),1 \n\t"
        "vlef %%v21,204(%%r1,%2),1 \n\t"
        "vlef %%v20,208(%%r1,%2),2 \n\t"
        "vlef %%v21,212(%%r1,%2),2 \n\t"
        "vlef %%v20,216(%%r1,%2),3 \n\t"
        "vlef %%v21,220(%%r1,%2),3 \n\t"

        "vlef %%v22,224(%%r1,%2),0 \n\t"
        "vlef %%v23,228(%%r1,%2),0 \n\t"
        "vlef %%v22,232(%%r1,%2),1 \n\t"
        "vlef %%v23,236(%%r1,%2),1 \n\t"
        "vlef %%v22,240(%%r1,%2),2 \n\t"
        "vlef %%v23,244(%%r1,%2),2 \n\t"
        "vlef %%v22,248(%%r1,%2),3 \n\t"
        "vlef %%v23,252(%%r1,%2),3 \n\t"

        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        "vfchsb %%v24,%%v17,%%v16 \n\t"
        "vfchsb %%v25,%%v19,%%v18 \n\t"
        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"

        "vfchsb %%v26,%%v25,%%v24 \n\t"
        "vsel %%v26,%%v24,%%v25,%%v26 \n\t"

        "vfchsb %%v27,%%v0,%%v26 \n\t"
        "vsel %%v0,%%v26,%%v0,%%v27 \n\t"

        /* Advance by one 256-byte block and loop while r0 != 0. */
        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Horizontal reduction of the four lanes of v0 into lane 0. */
        "veslg %%v16,%%v0,32 \n\t"
        "vfchsb %%v17,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"

        "vrepf %%v16,%%v0,2 \n\t"
        "wfchsb %%v17,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
        "ler %0,%%f0 "
        :"=f"(amin)
        /* n complex values occupy 2 * n FLOATs. */
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
    );

    return amin;
}
/*
 * Returns the minimum of |re| + |im| over n complex values of x.
 *
 *   n      number of complex elements
 *   x      interleaved (re, im) data
 *   inc_x  stride between consecutive elements, counted in complex elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  With unit stride the leading
 * multiple-of-32 part is handled by camin_kernel_32 and the remainder by a
 * scalar loop; other strides use a scalar loop unrolled by four.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG ix = 0;
    FLOAT minf = 0.0;

    if (n <= 0 || inc_x <= 0) return (minf);

    if (inc_x == 1) {

        BLASLONG n1 = n & -32;
        if (n1 > 0) {
            minf = camin_kernel_32(n1, x);
            i = n1;
        } else {
            /* Fewer than 32 elements: seed with the first one. */
            minf = CABS1(x, 0);
            i++;
        }

        /* Scalar tail.  Compare the full |re| + |im| of each element; the
           previous code looked at |re| only, which can report a value that
           is smaller than any element's actual |re| + |im|. */
        ix = 2 * i;
        while (i < n) {
            if (CABS1(x, ix) < minf) {
                minf = CABS1(x, ix);
            }
            i++;
            ix += 2;
        }
        return (minf);

    } else {

        BLASLONG inc_x2 = 2 * inc_x;     /* stride in FLOATs */
        BLASLONG j = 1;                  /* element 0 seeds the minimum */
        BLASLONG n1 = (n - 1) & -4;

        minf = CABS1(x, 0);
        ix = inc_x2;

        /* Four elements per pass. */
        while (j < n1) {

            if (CABS1(x, ix) < minf) {
                minf = CABS1(x, ix);
            }
            if (CABS1(x, ix + inc_x2) < minf) {
                minf = CABS1(x, ix + inc_x2);
            }
            if (CABS1(x, ix + inc_x2 * 2) < minf) {
                minf = CABS1(x, ix + inc_x2 * 2);
            }
            if (CABS1(x, ix + inc_x2 * 3) < minf) {
                minf = CABS1(x, ix + inc_x2 * 3);
            }

            ix += inc_x2 * 4;
            j += 4;
        }

        /* Remaining elements. */
        while (j < n) {
            if (CABS1(x, ix) < minf) {
                minf = CABS1(x, ix);
            }
            ix += inc_x2;
            j++;
        }
        return (minf);
    }
}

+ 167
- 0
kernel/zarch/casum.c View File

@@ -0,0 +1,167 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * Vector kernel: returns the sum of |re| + |im| over the first n complex
 * values of x (interleaved 4-byte re/im pairs).
 *
 * The loop runs n >> 5 times and each pass reads 256 bytes (32 complex
 * values), so n is expected to be a positive multiple of 32; the caller in
 * this file only passes n & -32 when that is greater than zero.
 */
static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT asum;

__asm__ (
/* v0..v3: four independent accumulators, cleared before the loop. */
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
/* r0 = number of 32-element blocks, r1 = byte offset into x. */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* First 128 bytes of the block. */
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"

/* Absolute value of every float lane. */
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"

/* Accumulate into v0..v3. */
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"

/* Second 128 bytes of the block, same treatment. */
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"

"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"

"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
/* Next 256-byte block; loop while the block counter is non-zero. */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
/* Combine the four accumulators, then reduce the four lanes of v0:
   shift each doubleword left by 32 to add odd lanes onto even lanes,
   replicate lane 2 into v1 and add it to lane 0. */
"vfasb %%v0,%%v0,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v2 \n\t"
"vfasb %%v0,%%v0,%%v3 \n\t"
"veslg %%v1,%%v0,32 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
"vrepf %%v1,%%v0,2 \n\t"
"aebr %%f0,%%f1 \n\t"
"ler %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);

return asum;
}

/*
 * Returns the sum of |re| + |im| over n complex values of x.
 *
 *   n      number of complex elements
 *   x      interleaved (re, im) data
 *   inc_x  stride between consecutive elements, counted in complex elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  With unit stride the leading
 * multiple-of-32 part goes through casum_kernel_32; everything else is
 * accumulated by the scalar loop.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT total = 0.0;
    BLASLONG done = 0;   /* complex elements consumed so far */
    BLASLONG pos = 0;    /* FLOAT index of the next real part */
    BLASLONG step;

    if (n <= 0 || inc_x <= 0)
        return (total);

    /* Distance between consecutive real parts, in FLOATs. */
    step = 2 * inc_x;

    if (inc_x == 1) {
        const BLASLONG blocked = n & -32;

        if (blocked > 0) {
            total = casum_kernel_32(blocked, x);
            done = blocked;
            pos = 2 * blocked;
        }
    }

    for (; done < n; done++, pos += step)
        total += ABS(x[pos]) + ABS(x[pos + 1]);

    return (total);
}



+ 174
- 0
kernel/zarch/caxpy.c View File

@@ -0,0 +1,174 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Vector kernel for y += alpha * x on n complex values (interleaved 4-byte
 * re/im pairs), where alpha[0] is the real part and alpha[1] the imaginary
 * part of the scalar.  When CONJ is defined the sign pattern of the
 * multipliers changes to match the CONJ branch of the scalar code in CNAME.
 *
 * The loop runs n >> 4 times and each pass handles 128 bytes (16 complex
 * values) of both x and y, so n is expected to be a positive multiple of 16.
 */
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
#if !defined(CONJ)
/* v0 = { ar, ar, ar, ar }; v1 = { -ai, ai, -ai, ai }:
   lanes 0 and 2 are loaded and negated first, lanes 1 and 3 are loaded
   afterwards and keep their sign. */
"vlrepf %%v0,0(%3) \n\t"
"vlef %%v1,4(%3),0 \n\t"
"vlef %%v1,4(%3),2 \n\t"
"vflcsb %%v1,%%v1 \n\t"
"vlef %%v1,4(%3),1 \n\t"
"vlef %%v1,4(%3),3 \n\t"
#else
/* v0 = { ar, -ar, ar, -ar }; v1 = { ai, ai, ai, ai }. */
"vlef %%v0,0(%3),1 \n\t"
"vlef %%v0,0(%3),3 \n\t"
"vflcsb %%v0,%%v0 \n\t"
"vlef %%v0,0(%3),0 \n\t"
"vlef %%v0,0(%3),2 \n\t"
"vlrepf %%v1,4(%3) \n\t"
#endif
/* r0 = number of 16-element blocks, r1 = byte offset into x and y. */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"

/* First 64 bytes: v16..v19 = x, v20..v23 = y. */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
/* Rotate each doubleword by 32 bits: swaps re and im within every pair. */
"verllg %%v24,%%v16,32 \n\t"
"verllg %%v25,%%v17,32 \n\t"
"verllg %%v26,%%v18,32 \n\t"
"verllg %%v27,%%v19,32 \n\t"

/* y + x * v0 */
"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"

/* ... + swapped(x) * v1 */
"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"

"vst %%v28,0(%%r1,%2) \n\t"
"vst %%v29,16(%%r1,%2) \n\t"
"vst %%v30,32(%%r1,%2) \n\t"
"vst %%v31,48(%%r1,%2) \n\t"

/* Second 64 bytes, same sequence. */
"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,80(%%r1,%1) \n\t"
"vl %%v18,96(%%r1,%1) \n\t"
"vl %%v19,112(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"verllg %%v24,%%v16,32 \n\t"
"verllg %%v25,%%v17,32 \n\t"
"verllg %%v26,%%v18,32 \n\t"
"verllg %%v27,%%v19,32 \n\t"

"vfmasb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmasb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmasb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmasb %%v31,%%v19,%%v0,%%v23 \n\t"

"vfmasb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmasb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmasb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmasb %%v31,%%v27,%%v1,%%v31 \n\t"

"vst %%v28,64(%%r1,%2) \n\t"
"vst %%v29,80(%%r1,%2) \n\t"
"vst %%v30,96(%%r1,%2) \n\t"
"vst %%v31,112(%%r1,%2) \n\t"

/* Next 128-byte block; loop while the block counter is non-zero. */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

/*
 * Complex AXPY entry point: y := y + alpha * x, where alpha = da_r + i*da_i.
 * When CONJ is defined the update uses conj(x) instead of x.
 *
 * n            number of complex elements; nothing is done when n <= 0
 * da_r, da_i   real and imaginary parts of alpha
 * x, inc_x     input vector (interleaved re/im) and its stride in complex elements
 * y, inc_y     vector updated in place and its stride in complex elements
 * dummy*       unused
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    BLASLONG done = 0;          /* complex elements already processed */
    BLASLONG xi = 0, yi = 0;    /* FLOAT indices into x and y */
    BLASLONG step_x, step_y;    /* per-element advance, in FLOATs */

    if (n <= 0)
        return 0;

    if (inc_x == 1 && inc_y == 1) {
        /* Unit stride: the vector kernel takes the leading multiple of 16. */
        BLASLONG vec_n = n & -16;

        if (vec_n != 0) {
            FLOAT alpha[2];

            alpha[0] = da_r;
            alpha[1] = da_i;
            caxpy_kernel_16(vec_n, x, y, alpha);
        }
        done = vec_n;
        xi = 2 * vec_n;
        yi = 2 * vec_n;
        step_x = 2;
        step_y = 2;
    } else {
        step_x = 2 * inc_x;
        step_y = 2 * inc_y;
    }

    /* Scalar loop: the whole vector when strided, the remainder otherwise.
       x is re-read after y[yi] is written, exactly one element at a time. */
    for (; done < n; done++) {
#if !defined(CONJ)
        y[yi] += (da_r * x[xi] - da_i * x[xi + 1]);
        y[yi + 1] += (da_r * x[xi + 1] + da_i * x[xi]);
#else
        y[yi] += (da_r * x[xi] + da_i * x[xi + 1]);
        y[yi + 1] -= (da_r * x[xi + 1] - da_i * x[xi]);
#endif
        xi += step_x;
        yi += step_y;
    }

    return 0;
}



+ 99
- 0
kernel/zarch/ccopy.c View File

@@ -0,0 +1,99 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Block copy of interleaved complex data from x to y using MVC.
 *
 * n : number of complex elements. The loop count is n >> 5 and every pass
 *     moves 256 bytes, i.e. 32 complex elements when FLOAT is 4 bytes
 *     (presumably float in this file -- confirm against the build).
 *     The count must be non-zero on entry because brctg decrements before
 *     testing; CNAME only calls this with a positive multiple of 32.
 * x : source, read with unit stride.
 * y : destination, written with unit stride.
 */
static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* r1/r2 are working copies of the source/destination addresses. */
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
/* r0 = n / 32 = number of 256-byte chunks. */
"srlg %%r0,%0,5 \n\t"
"0: \n\t"
/* Prefetch 1 KiB ahead: code 1 = for fetch, code 2 = for store. */
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
/* Storage-to-storage move of 256 bytes. */
"mvc 0(256,%%r2),0(%%r1) \n\t"
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","r2"
);
}

/*
 * Complex COPY entry point: y := x.
 *
 * n          number of complex elements; nothing is done when n <= 0
 * x, inc_x   source vector (interleaved re/im) and its stride in complex elements
 * y, inc_y   destination vector and its stride in complex elements
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i = 0;
    BLASLONG ix = 0, iy = 0;

    if (n <= 0)
        return 0;

    if ((inc_x == 1) && (inc_y == 1)) {
        /* Unit stride: bulk-copy the leading multiple of 32 elements. */
        BLASLONG n1 = n & -32;

        if (n1 > 0) {
            ccopy_kernel_32(n1, x, y);
            i = n1;
            ix = n1 * 2;
            iy = n1 * 2;
        }

        /* Remaining (< 32) elements. The source is indexed with ix for both
           parts; ix and iy are equal in this branch. */
        while (i < n) {
            y[iy] = x[ix];
            y[iy + 1] = x[ix + 1];
            ix += 2;
            iy += 2;
            i++;
        }
    } else {
        /* Strides are given in complex elements; convert to FLOATs. */
        BLASLONG inc_x2 = 2 * inc_x;
        BLASLONG inc_y2 = 2 * inc_y;

        while (i < n) {
            y[iy] = x[ix];
            y[iy + 1] = x[ix + 1];
            ix += inc_x2;
            iy += inc_y2;
            i++;
        }
    }

    return 0;
}

+ 182
- 0
kernel/zarch/cdot.c View File

@@ -0,0 +1,182 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Vectorized partial sums for the complex dot product over unit-stride data.
 *
 * n    : number of complex elements. The loop count is n >> 4 and each pass
 *        advances 128 bytes through x and y (16 complex elements of two
 *        4-byte FLOATs). The count must be non-zero on entry because brctg
 *        decrements before testing; CNAME only calls this with a non-zero
 *        multiple of 16.
 * x, y : interleaved (re, im) inputs, only read.
 * d    : four FLOATs that are overwritten (not accumulated into) with
 *          d[0] = sum x_re*y_re     d[1] = sum x_im*y_im
 *          d[2] = sum x_re*y_im     d[3] = sum x_im*y_re
 *        The caller combines them according to CONJ.
 */
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
__asm__ volatile(
/* v24,v26,v28,v30 accumulate x*y; v25,v27,v29,v31 accumulate swap(x)*y. */
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"vzero %%v28 \n\t"
"vzero %%v29 \n\t"
"vzero %%v30 \n\t"
"vzero %%v31 \n\t"
/* r0 = n / 16 iterations, r1 = running byte offset. */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

/* First 64 bytes: v16-v19 = x, v0-v3 = y. */
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
/* Rotating each 64-bit lane by 32 swaps re and im of every x element. */
"verllg %%v20,%%v16,32 \n\t"
"verllg %%v21,%%v17,32 \n\t"
"verllg %%v22,%%v18,32 \n\t"
"verllg %%v23,%%v19,32 \n\t"

"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmasb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmasb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmasb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31 \n\t"

/* Second 64 bytes, same pattern. */
"vl %%v16, 64(%%r1,%1) \n\t"
"vl %%v17, 80(%%r1,%1) \n\t"
"vl %%v18, 96(%%r1,%1) \n\t"
"vl %%v19, 112(%%r1,%1) \n\t"
"vl %%v0, 64(%%r1,%2) \n\t"
"vl %%v1, 80(%%r1,%2) \n\t"
"vl %%v2, 96(%%r1,%2) \n\t"
"vl %%v3, 112(%%r1,%2) \n\t"
"verllg %%v20,%%v16,32 \n\t"
"verllg %%v21,%%v17,32 \n\t"
"verllg %%v22,%%v18,32 \n\t"
"verllg %%v23,%%v19,32 \n\t"

"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmasb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmasb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmasb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmasb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmasb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmasb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmasb %%v31,%%v23,%%v3,%%v31 \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
/* Reduce the four x*y accumulators, then fold the upper doubleword onto
   the lower one so word lanes 0 and 1 hold the totals. */
"vfasb %%v24,%%v24,%%v26 \n\t"
"vfasb %%v24,%%v24,%%v28 \n\t"
"vfasb %%v24,%%v24,%%v30 \n\t"
"vrepg %%v26,%%v24,1 \n\t"
"vfasb %%v24,%%v24,%%v26 \n\t"
/* Same reduction for the swap(x)*y accumulators. */
"vfasb %%v25,%%v25,%%v27 \n\t"
"vfasb %%v25,%%v25,%%v29 \n\t"
"vfasb %%v25,%%v25,%%v31 \n\t"
"vrepg %%v27,%%v25,1 \n\t"
"vfasb %%v25,%%v25,%%v27 \n\t"
/* d[0..1] come from v24 lanes 0,1; d[2..3] from v25 lanes 1,0 (swapped). */
"vstef %%v24,0(%3),0 \n\t"
"vstef %%v24,4(%3),1 \n\t"
"vstef %%v25,8(%3),1 \n\t"
"vstef %%v25,12(%3),0 "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

/*
 * Complex dot product entry point.
 *
 * Accumulates the four real cross products of x and y
 *     dot[0] = sum x_re*y_re    dot[1] = sum x_im*y_im
 *     dot[2] = sum x_re*y_im    dot[3] = sum x_im*y_re
 * and combines them as
 *     (dot[0] - dot[1]) + i*(dot[2] + dot[3])   without CONJ
 *     (dot[0] + dot[1]) + i*(dot[2] - dot[3])   with CONJ
 *
 * n          number of complex elements; the result is 0 + 0i when n <= 0
 * x, inc_x   first vector (interleaved re/im) and its stride in complex elements
 * y, inc_y   second vector and its stride in complex elements
 */
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    OPENBLAS_COMPLEX_FLOAT result;
    FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
    BLASLONG done;              /* complex elements already processed */
    BLASLONG xi, yi;            /* FLOAT indices into x and y */
    BLASLONG step_x, step_y;    /* per-element advance, in FLOATs */

    if (n <= 0) {
        CREAL(result) = 0.0;
        CIMAG(result) = 0.0;
        return result;
    }

    if (inc_x == 1 && inc_y == 1) {
        /* Unit stride: the vector kernel fills dot[] from the leading
           multiple of 16 elements. */
        BLASLONG vec_n = n & -16;

        if (vec_n != 0)
            cdot_kernel_16(vec_n, x, y, dot);

        done = vec_n;
        xi = vec_n * 2;
        yi = vec_n * 2;
        step_x = 2;
        step_y = 2;
    } else {
        done = 0;
        xi = 0;
        yi = 0;
        step_x = inc_x << 1;
        step_y = inc_y << 1;
    }

    /* Scalar accumulation: everything when strided, the remainder otherwise. */
    for (; done < n; done++) {
        dot[0] += x[xi] * y[yi];
        dot[1] += x[xi + 1] * y[yi + 1];
        dot[2] += x[xi] * y[yi + 1];
        dot[3] += x[xi + 1] * y[yi];

        xi += step_x;
        yi += step_y;
    }

#if !defined(CONJ)
    CREAL(result) = dot[0] - dot[1];
    CIMAG(result) = dot[2] + dot[3];
#else
    CREAL(result) = dot[0] + dot[1];
    CIMAG(result) = dot[2] - dot[3];
#endif

    return result;
}



+ 256
- 0
kernel/zarch/crot.c View File

@@ -0,0 +1,256 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Applies a plane rotation with real c and s to unit-stride complex vectors,
 * in place:
 *     x' = c*x + s*y
 *     y' = c*y - s*x
 * *c and *s are replicated into every FLOAT lane, so real and imaginary
 * parts go through the same per-lane formula.
 *
 * n    : number of complex elements. The loop count is n >> 5 and each pass
 *        advances 256 bytes (32 complex elements of two 4-byte FLOATs),
 *        handled as four 64-byte groups. The count must be non-zero on entry
 *        because brctg decrements before testing; CNAME only calls this with
 *        a positive multiple of 32.
 * x, y : vectors, both read and written.
 * c, s : pointers to the rotation coefficients.
 *
 * NOTE(review): unlike the sibling kernels this asm statement is not marked
 * volatile. GCC documents asm statements without output operands as
 * implicitly volatile -- confirm for the compilers in use.
 */
static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
/* v0 = c in all lanes, v1 = s in all lanes. */
"vlrepf %%v0,%3 \n\t"
"vlrepf %%v1,%4 \n\t"
/* r0 = n / 32 iterations, r1 = running byte offset. */
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* Group 1 (bytes 0..63): v24-v27 = x, v16-v19 = y. */
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
/* v28-v31 = x*c, v20-v23 = x*s. */
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
/* v28-v31 = y*s + x*c (new x), v20-v23 = y*c - x*s (new y). */
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
/* Group 2 (bytes 64..127), same pattern. */
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
/* Group 3 (bytes 128..191), same pattern. */
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
/* Group 4 (bytes 192..255), same pattern. */
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

/*
 * Plane rotation of two complex vectors by real coefficients, in place:
 *     x' = c*x + s*y
 *     y' = c*y - s*x
 * applied separately to the real and imaginary parts of each element.
 *
 * n          number of complex elements; nothing is done when n <= 0
 * x, inc_x   first vector (interleaved re/im) and its stride in complex elements
 * y, inc_y   second vector and its stride in complex elements
 * c, s       rotation coefficients
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG done = 0;          /* complex elements already processed */
    BLASLONG xi = 0, yi = 0;    /* FLOAT indices into x and y */
    BLASLONG step_x, step_y;    /* per-element advance, in FLOATs */

    if (n <= 0)
        return 0;

    if (inc_x == 1 && inc_y == 1) {
        /* Unit stride: the vector kernel takes the leading multiple of 32. */
        BLASLONG vec_n = n & -32;

        if (vec_n > 0) {
            /* The kernel reads the coefficients through pointers. */
            FLOAT cos_val = c;
            FLOAT sin_val = s;

            crot_kernel_32(vec_n, x, y, &cos_val, &sin_val);
            done = vec_n;
            xi = 2 * vec_n;
            yi = 2 * vec_n;
        }
        step_x = 2;
        step_y = 2;
    } else {
        step_x = 2 * inc_x;
        step_y = 2 * inc_y;
    }

    /* Scalar loop: everything when strided, the remainder otherwise.
       The new x values are held back until y has been updated from the
       old x values. */
    for (; done < n; done++) {
        FLOAT new_x_re = c * x[xi] + s * y[yi];
        FLOAT new_x_im = c * x[xi + 1] + s * y[yi + 1];

        y[yi] = c * y[yi] - s * x[xi];
        y[yi + 1] = c * y[yi + 1] - s * x[xi + 1];
        x[xi] = new_x_re;
        x[xi + 1] = new_x_im;

        xi += step_x;
        yi += step_y;
    }

    return 0;
}



+ 456
- 0
kernel/zarch/cscal.c View File

@@ -0,0 +1,456 @@
/***************************************************************************
Copyright (c) 2013 - 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * General complex scaling of a unit-stride vector, in place: x := alpha * x
 * with alpha = alpha[0] + i*alpha[1], i.e. per element
 *     re' = alpha[0]*re - alpha[1]*im
 *     im' = alpha[0]*im + alpha[1]*re
 *
 * n     : number of complex elements. The loop count is n >> 4 and each pass
 *         advances 128 bytes (16 complex elements of two 4-byte FLOATs).
 *         The count must be non-zero on entry because brctg decrements
 *         before testing; CNAME only calls this with a positive multiple
 *         of 16.
 * alpha : two FLOATs, real part first.
 * x     : vector, read and written.
 */
static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
/* v0 = alpha[0] in all lanes. */
"vlrepf %%v0,0(%1) \n\t"
/* Build v1 = (-alpha[1], +alpha[1], -alpha[1], +alpha[1]): load lanes 0,2,
   negate the whole vector, then overwrite lanes 1,3 with the positive value. */
"vlef %%v1,4(%1),0 \n\t"
"vlef %%v1,4(%1),2 \n\t"
"vflcsb %%v1,%%v1 \n\t"
"vlef %%v1,4(%1),1 \n\t"
"vlef %%v1,4(%1),3 \n\t"
/* r0 = n / 16 iterations, r1 = running byte offset. */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* v24-v31 = x with re and im swapped (rotate each 64-bit lane by 32). */
"verllg %%v24,%%v16,32 \n\t"
"verllg %%v25,%%v17,32 \n\t"
"verllg %%v26,%%v18,32 \n\t"
"verllg %%v27,%%v19,32 \n\t"
"verllg %%v28,%%v20,32 \n\t"
"verllg %%v29,%%v21,32 \n\t"
"verllg %%v30,%%v22,32 \n\t"
"verllg %%v31,%%v23,32 \n\t"

/* x * alpha[0], then add swap(x) * (-alpha[1], +alpha[1]). */
"vfmsb %%v16,%%v16,%%v0 \n\t"
"vfmsb %%v17,%%v17,%%v0 \n\t"
"vfmsb %%v18,%%v18,%%v0 \n\t"
"vfmsb %%v19,%%v19,%%v0 \n\t"
"vfmsb %%v20,%%v20,%%v0 \n\t"
"vfmsb %%v21,%%v21,%%v0 \n\t"
"vfmsb %%v22,%%v22,%%v0 \n\t"
"vfmsb %%v23,%%v23,%%v0 \n\t"
"vfmasb %%v16,%%v24,%%v1,%%v16 \n\t"
"vfmasb %%v17,%%v25,%%v1,%%v17 \n\t"
"vfmasb %%v18,%%v26,%%v1,%%v18 \n\t"
"vfmasb %%v19,%%v27,%%v1,%%v19 \n\t"
"vfmasb %%v20,%%v28,%%v1,%%v20 \n\t"
"vfmasb %%v21,%%v29,%%v1,%%v21 \n\t"
"vfmasb %%v22,%%v30,%%v1,%%v22 \n\t"
"vfmasb %%v23,%%v31,%%v1,%%v23 \n\t"

"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Complex scaling specialised for a purely imaginary alpha (real part zero),
 * unit stride, in place. Only alpha[1] is read; per element
 *     re' = -alpha[1]*im
 *     im' =  alpha[1]*re
 *
 * n     : number of complex elements. The loop count is n >> 4 and each pass
 *         advances 128 bytes (16 complex elements of two 4-byte FLOATs).
 *         The count must be non-zero on entry because brctg decrements
 *         before testing; CNAME only calls this with a positive multiple
 *         of 16.
 * alpha : two FLOATs, real part first (the real part is ignored here).
 * x     : vector, read and written.
 */
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
/* Build v0 = (-alpha[1], +alpha[1], -alpha[1], +alpha[1]): load lanes 0,2,
   negate the whole vector, then overwrite lanes 1,3 with the positive value. */
"vlef %%v0,4(%1),0 \n\t"
"vlef %%v0,4(%1),2 \n\t"
"vflcsb %%v0,%%v0 \n\t"
"vlef %%v0,4(%1),1 \n\t"
"vlef %%v0,4(%1),3 \n\t"
/* r0 = n / 16 iterations, r1 = running byte offset. */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* Swap re and im of every element (rotate each 64-bit lane by 32). */
"verllg %%v16,%%v16,32 \n\t"
"verllg %%v17,%%v17,32 \n\t"
"verllg %%v18,%%v18,32 \n\t"
"verllg %%v19,%%v19,32 \n\t"
"verllg %%v20,%%v20,32 \n\t"
"verllg %%v21,%%v21,32 \n\t"
"verllg %%v22,%%v22,32 \n\t"
"verllg %%v23,%%v23,32 \n\t"

/* (im, re) * (-alpha[1], +alpha[1]). */
"vfmsb %%v16,%%v16,%%v0 \n\t"
"vfmsb %%v17,%%v17,%%v0 \n\t"
"vfmsb %%v18,%%v18,%%v0 \n\t"
"vfmsb %%v19,%%v19,%%v0 \n\t"
"vfmsb %%v20,%%v20,%%v0 \n\t"
"vfmsb %%v21,%%v21,%%v0 \n\t"
"vfmsb %%v22,%%v22,%%v0 \n\t"
"vfmsb %%v23,%%v23,%%v0 \n\t"

"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
}

/*
 * Complex scaling specialised for a purely real alpha (imaginary part zero),
 * unit stride, in place. Only alpha[0] is read and every FLOAT of x is
 * multiplied by it.
 *
 * n     : number of complex elements. The loop count is n >> 4 and each pass
 *         advances 128 bytes (16 complex elements of two 4-byte FLOATs).
 *         The count must be non-zero on entry because brctg decrements
 *         before testing; CNAME only calls this with a positive multiple
 *         of 16.
 * alpha : two FLOATs, real part first (the imaginary part is ignored here).
 * x     : vector, read and written.
 */
static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
/* v0 = alpha[0] in all lanes. */
"vlrepf %%v0,0(%1) \n\t"
/* r0 = n / 16 iterations, r1 = running byte offset. */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"

"vfmsb %%v16,%%v16,%%v0 \n\t"
"vfmsb %%v17,%%v17,%%v0 \n\t"
"vfmsb %%v18,%%v18,%%v0 \n\t"
"vfmsb %%v19,%%v19,%%v0 \n\t"
"vfmsb %%v20,%%v20,%%v0 \n\t"
"vfmsb %%v21,%%v21,%%v0 \n\t"
"vfmsb %%v22,%%v22,%%v0 \n\t"
"vfmsb %%v23,%%v23,%%v0 \n\t"

"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
}

/*
 * Zero-fills a unit-stride complex vector (the alpha == 0 case of scaling).
 * x is only written, never read, so existing NaN/Inf values are replaced
 * by zero as well.
 *
 * n : number of complex elements. The loop count is n >> 4 and each pass
 *     stores 128 bytes (16 complex elements of two 4-byte FLOATs). The
 *     count must be non-zero on entry because brctg decrements before
 *     testing; CNAME only calls this with a positive multiple of 16.
 * x : vector to clear.
 */
static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x)
{
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
/* r0 = n / 16 iterations, r1 = running byte offset. */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"

"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
}

/*
 * Scalar complex scaling for a strided vector, four elements per pass:
 *     re' = alpha[0]*re - alpha[1]*im
 *     im' = alpha[0]*im + alpha[1]*re
 *
 * n      number of complex elements; the loop steps by 4, so n should be a
 *        multiple of 4 (CNAME passes a multiple of 8)
 * alpha  two FLOATs, real part first
 * x      vector, read and written
 * inc_x  distance between consecutive complex elements, measured in FLOATs
 *        (CNAME passes the already doubled stride)
 */
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
{
    const FLOAT da_r = alpha[0];
    const FLOAT da_i = alpha[1];
    /* FLOAT offsets of the 2nd, 3rd and 4th element of each group. */
    const BLASLONG off1 = inc_x;
    const BLASLONG off2 = 2 * inc_x;
    const BLASLONG off3 = off2 + inc_x;
    FLOAT *p = x;
    BLASLONG done;

    for (done = 0; done < n; done += 4, p += 4 * inc_x) {
        /* All four new real parts are computed before anything is stored. */
        FLOAT re0 = da_r * p[0] - da_i * p[1];
        FLOAT re1 = da_r * p[off1] - da_i * p[off1 + 1];
        FLOAT re2 = da_r * p[off2] - da_i * p[off2 + 1];
        FLOAT re3 = da_r * p[off3] - da_i * p[off3 + 1];

        /* Imaginary parts still see the old real parts. */
        p[1] = da_i * p[0] + da_r * p[1];
        p[off1 + 1] = da_i * p[off1] + da_r * p[off1 + 1];
        p[off2 + 1] = da_i * p[off2] + da_r * p[off2 + 1];
        p[off3 + 1] = da_i * p[off3] + da_r * p[off3 + 1];

        p[0] = re0;
        p[off1] = re1;
        p[off2] = re2;
        p[off3] = re3;
    }
}

/*
 * Complex SCAL entry point: x := alpha * x with alpha = da_r + i*da_i.
 *
 * Four cases are distinguished by which parts of alpha are exactly zero:
 *   da_r == 0 and da_i == 0 : x is overwritten with zeros (never read)
 *   da_r == 0               : re' = -da_i*im, im' = da_i*re
 *   da_i == 0               : re' = da_r*re,  im' = da_r*im
 *   otherwise               : full complex multiply
 *
 * n            number of complex elements; nothing is touched when n <= 0
 * da_r, da_i   real and imaginary parts of alpha
 * x, inc_x     vector scaled in place and its stride in complex elements
 * y, inc_y, dummy*   unused
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
    const int re_is_zero = (da_r == 0.0);
    const int im_is_zero = (da_i == 0.0);
    BLASLONG pos = 0;       /* FLOAT index of the next element's real part */
    BLASLONG done = 0;      /* complex elements already processed */
    BLASLONG stride;        /* per-element advance, in FLOATs */
    FLOAT new_re;
    FLOAT alpha[2] __attribute__ ((aligned(16)));

    if (inc_x != 1) {
        stride = inc_x << 1;

        /* Only the general case has a blocked helper for strided data. */
        if (!re_is_zero && !im_is_zero) {
            BLASLONG blk_n = n & -8;

            if (blk_n > 0) {
                alpha[0] = da_r;
                alpha[1] = da_i;
                cscal_kernel_inc_8(blk_n, alpha, x, stride);
                done = blk_n;
                pos = blk_n * stride;
            }
        }
    } else {
        BLASLONG vec_n = n & -16;

        stride = 2;

        /* Unit stride: a vector kernel takes the leading multiple of 16. */
        if (vec_n > 0) {
            alpha[0] = da_r;
            alpha[1] = da_i;

            if (re_is_zero) {
                if (im_is_zero) {
                    cscal_kernel_16_zero(vec_n, x);
                } else {
                    cscal_kernel_16_zero_r(vec_n, alpha, x);
                }
            } else {
                if (im_is_zero) {
                    cscal_kernel_16_zero_i(vec_n, alpha, x);
                } else {
                    cscal_kernel_16(vec_n, alpha, x);
                }
            }

            pos = 2 * vec_n;
            done = vec_n;
        }
    }

    /* Scalar loops for whatever is left, one element at a time. */
    if (re_is_zero && im_is_zero) {
        for (; done < n; done++, pos += stride) {
            x[pos] = 0.0;
            x[pos + 1] = 0.0;
        }
    } else if (re_is_zero) {
        for (; done < n; done++, pos += stride) {
            new_re = -da_i * x[pos + 1];
            x[pos + 1] = da_i * x[pos];
            x[pos] = new_re;
        }
    } else if (im_is_zero) {
        for (; done < n; done++, pos += stride) {
            new_re = da_r * x[pos];
            x[pos + 1] = da_r * x[pos + 1];
            x[pos] = new_re;
        }
    } else {
        for (; done < n; done++, pos += stride) {
            new_re = da_r * x[pos] - da_i * x[pos + 1];
            x[pos + 1] = da_r * x[pos + 1] + da_i * x[pos];
            x[pos] = new_re;
        }
    }

    return 0;
}

+ 183
- 0
kernel/zarch/cswap.c View File

@@ -0,0 +1,183 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Exchanges the contents of two unit-stride complex vectors.
 *
 * n    : number of complex elements. The loop count is n >> 5 and each pass
 *        advances 256 bytes (32 complex elements of two 4-byte FLOATs).
 *        The count must be non-zero on entry because brctg decrements
 *        before testing; CNAME only calls this with a positive multiple
 *        of 32.
 * x, y : vectors, both read and written.
 */
static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
/* r0 = n / 32 iterations, r1 = running byte offset. */
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* Hold the whole 256-byte chunk of x in v16-v31. */
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"

/* Move the first 128 bytes of y into x via v0-v7. */
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"

/* Move the second 128 bytes of y into x via v0-v7. */
"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"

/* Write the saved x chunk out to y. */
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

/*
 * Complex SWAP entry point: exchanges the elements of x and y.
 *
 * n          number of complex elements; nothing is done when n <= 0
 * x, inc_x   first vector (interleaved re/im) and its stride in complex elements
 * y, inc_y   second vector and its stride in complex elements
 * dummy*     unused
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG done = 0;          /* complex elements already processed */
    BLASLONG xi = 0, yi = 0;    /* FLOAT indices into x and y */
    BLASLONG step_x, step_y;    /* per-element advance, in FLOATs */

    if (n <= 0)
        return 0;

    if (inc_x == 1 && inc_y == 1) {
        /* Unit stride: the vector kernel takes the leading multiple of 32. */
        BLASLONG vec_n = n & -32;

        if (vec_n > 0) {
            cswap_kernel_32(vec_n, x, y);
            done = vec_n;
            xi = 2 * vec_n;
            yi = 2 * vec_n;
        }
        step_x = 2;
        step_y = 2;
    } else {
        step_x = 2 * inc_x;
        step_y = 2 * inc_y;
    }

    /* Scalar loop: everything when strided, the remainder otherwise. */
    for (; done < n; done++) {
        FLOAT saved_re = x[xi];
        FLOAT saved_im = x[xi + 1];

        x[xi] = y[yi];
        x[xi + 1] = y[yi + 1];
        y[yi] = saved_re;
        y[yi + 1] = saved_im;

        xi += step_x;
        yi += step_y;
    }

    return 0;
}



+ 206
- 0
kernel/zarch/damax.c View File

@@ -0,0 +1,206 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * damax_kernel_32: vectorised max(|x[i]|) over the first n doubles of x.
 *
 * Asm operands: %0 = result (FPR), %1 = n, %2 = address of x.
 * The loop count is n >> 5 and every pass consumes 256 bytes (32 doubles),
 * so n is expected to be a positive multiple of 32; the driver below only
 * calls this kernel with (n & -32) when that value is greater than zero.
 * Returns the largest absolute value found.
 */
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amax;

__asm__ volatile (
/* v0 = |x[0]|,|x[1]|: seed for the running per-lane maximum */
"vl %%v0,0(%2) \n\t"
"vflpdb %%v0,%%v0 \n\t"
/* r0 = n / 32 loop passes, r1 = byte offset into x (starts at 0) */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch (for fetch access) 1024 bytes ahead of the current offset */
"pfd 1, 1024(%%r1,%2) \n\t"

/* first 16 doubles of this pass: load 8 vectors, take absolute values */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
/* pairwise max: vfchdb builds the (a > b) lane mask, vsel keeps a where set, else b */
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

/* reduce 4 candidates -> 2 */
"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

/* reduce 2 candidates -> 1 */
"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

/* fold this half-block's maximum into the running maximum v0 */
"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* second 16 doubles of this pass: same load / abs / max tree */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* advance 256 bytes; brctg decrements r0 and loops while it is non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

/* horizontal step: compare lane 1 against lane 0, keep the larger in lane 0, return it */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return amax;
}
/*
 * Largest absolute value among n elements of x read with stride inc_x.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For unit stride the leading
 * (n & -32) elements go through damax_kernel_32 and the remainder is
 * handled by a scalar loop; for any other stride a scalar loop that
 * visits four elements per pass is used, followed by a scalar tail.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG pos = 0;    /* offset of the next element inside x */
    BLASLONG seen = 0;   /* strided path only: elements consumed so far */
    BLASLONG bulk;
    BLASLONG lane;
    FLOAT mag;
    FLOAT best = 0.0;

    if (n <= 0 || inc_x <= 0)
        return best;

    if (inc_x != 1) {
        /* Strided input: element 0 seeds the maximum. */
        best = ABS(x[0]);
        pos = inc_x;
        seen = 1;

        /* Groups of four; the last index touched stays below n. */
        bulk = (n - 1) & -4;
        for (; seen < bulk; seen += 4, pos += inc_x * 4) {
            for (lane = 0; lane < 4; lane++) {
                mag = ABS(x[pos + lane * inc_x]);
                if (mag > best)
                    best = mag;
            }
        }

        /* Whatever is left after the grouped passes. */
        for (; seen < n; seen++, pos += inc_x) {
            mag = ABS(x[pos]);
            if (mag > best)
                best = mag;
        }
        return best;
    }

    /* Contiguous input: vector kernel on the multiple-of-32 prefix. */
    bulk = n & -32;
    if (bulk > 0) {
        best = damax_kernel_32(bulk, x);
        pos = bulk;
    } else {
        best = ABS(x[0]);
        pos = 1;
    }

    for (; pos < n; pos++) {
        mag = ABS(x[pos]);
        if (mag > best)
            best = mag;
    }
    return best;
}

+ 206
- 0
kernel/zarch/damin.c View File

@@ -0,0 +1,206 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * damin_kernel_32: vectorised min(|x[i]|) over the first n doubles of x.
 *
 * Asm operands: %0 = result (FPR), %1 = n, %2 = address of x.
 * The loop count is n >> 5 and every pass consumes 256 bytes (32 doubles),
 * so n is expected to be a positive multiple of 32; the driver below only
 * calls this kernel with (n & -32) when that value is greater than zero.
 * Returns the smallest absolute value found.
 */
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT amin;

__asm__ volatile (
/* v0 = |x[0]|,|x[1]|: seed for the running per-lane minimum */
"vl %%v0,0(%2) \n\t"
"vflpdb %%v0,%%v0 \n\t"
/* r0 = n / 32 loop passes, r1 = byte offset into x (starts at 0) */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch (for fetch access) 1024 bytes ahead of the current offset */
"pfd 1, 1024(%%r1,%2) \n\t"

/* first 16 doubles of this pass: load 8 vectors, take absolute values */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
/* pairwise min: vfchdb operands are swapped so the mask marks (b > a); vsel then keeps a where set, else b */
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

/* reduce 4 candidates -> 2 */
"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

/* reduce 2 candidates -> 1 */
"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

/* fold this half-block's minimum into the running minimum v0 */
"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* second 16 doubles of this pass: same load / abs / min tree */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* advance 256 bytes; brctg decrements r0 and loops while it is non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

/* horizontal step: compare lane 0 against lane 1, keep the smaller in lane 0, return it */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return amin;
}
/*
 * Smallest absolute value among n elements of x read with stride inc_x.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For unit stride the leading
 * (n & -32) elements go through damin_kernel_32 and the remainder is
 * handled by a scalar loop; for any other stride a scalar loop that
 * visits four elements per pass is used, followed by a scalar tail.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG pos = 0;    /* offset of the next element inside x */
    BLASLONG seen = 0;   /* strided path only: elements consumed so far */
    BLASLONG bulk;
    BLASLONG lane;
    FLOAT mag;
    FLOAT least = 0.0;

    if (n <= 0 || inc_x <= 0)
        return least;

    if (inc_x != 1) {
        /* Strided input: element 0 seeds the minimum. */
        least = ABS(x[0]);
        pos = inc_x;
        seen = 1;

        /* Groups of four; the last index touched stays below n. */
        bulk = (n - 1) & -4;
        for (; seen < bulk; seen += 4, pos += inc_x * 4) {
            for (lane = 0; lane < 4; lane++) {
                mag = ABS(x[pos + lane * inc_x]);
                if (mag < least)
                    least = mag;
            }
        }

        /* Whatever is left after the grouped passes. */
        for (; seen < n; seen++, pos += inc_x) {
            mag = ABS(x[pos]);
            if (mag < least)
                least = mag;
        }
        return least;
    }

    /* Contiguous input: vector kernel on the multiple-of-32 prefix. */
    bulk = n & -32;
    if (bulk > 0) {
        least = damin_kernel_32(bulk, x);
        pos = bulk;
    } else {
        least = ABS(x[0]);
        pos = 1;
    }

    for (; pos < n; pos++) {
        mag = ABS(x[pos]);
        if (mag < least)
            least = mag;
    }
    return least;
}

+ 83
- 75
kernel/zarch/dasum.c View File

@@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -23,8 +23,7 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

*****************************************************************************/

#include "common.h"
#include <math.h>
@@ -35,80 +34,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ABS fabsf
#endif



static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
FLOAT asum ;
__asm__ (
"pfd 1, 0(%[ptr_x]) \n\t"
"sllg %%r0,%[n],3 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_temp] ) \n\t"
"vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t"
"vfadb %%v2,%%v2,%%v26 \n\t"
"vfadb %%v3,%%v3,%%v27 \n\t"
"vfadb %%v0,%%v0,%%v28 \n\t"
"vfadb %%v1,%%v1,%%v29 \n\t"
"vfadb %%v2,%%v2,%%v30 \n\t"
"vfadb %%v3,%%v3,%%v31 \n\t"
"vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"la %[ptr_temp],256(%[ptr_temp]) \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t"
"vfadb %%v2,%%v2,%%v26 \n\t"
"vfadb %%v3,%%v3,%%v27 \n\t"
"vfadb %%v0,%%v0,%%v28 \n\t"
"vfadb %%v1,%%v1,%%v29 \n\t"
"vfadb %%v2,%%v2,%%v30 \n\t"
"vfadb %%v3,%%v3,%%v31 \n\t"
"clgrjl %[ptr_temp],%%r0,1b \n\t"
"vfadb %%v24,%%v0,%%v1 \n\t"
"vfadb %%v25,%%v2,%%v3 \n\t"
"vfadb %%v0,%%v25,%%v24 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %[asum],%%f0 \n\t"
: [asum] "=f"(asum),[ptr_temp] "+&a"(x)
: [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x)
: "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31"
);
return asum;

/*
 * dasum_kernel_32: vectorised sum of |x[i]| over the first n doubles of x.
 *
 * Asm operands: %0 = result (FPR), %1 = n, %2 = address of x.
 * The loop count is n >> 5 and every pass consumes 256 bytes (32 doubles),
 * so n is expected to be a positive multiple of 32 (NOTE(review): confirm
 * the caller guarantees this; the driver is only partly visible here).
 */
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT asum;

__asm__ (
/* four independent two-lane accumulators */
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
/* r0 = n / 32 loop passes, r1 = byte offset into x (starts at 0) */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
/* first 16 doubles of this pass */
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"

/* absolute values */
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"

/* accumulate: each of v0..v3 receives two of the eight vectors */
"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"

/* second 16 doubles of this pass: same load / abs / accumulate */
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"

"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"

"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
/* advance 256 bytes; brctg decrements r0 and loops while it is non-zero */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
/* combine the four accumulators, then add lane 1 to lane 0 for the scalar result */
"vfadb %%v0,%%v0,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v2 \n\t"
"vfadb %%v0,%%v0,%%v3 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);

return asum;
}




FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;


+ 89
- 88
kernel/zarch/daxpy.c View File

@@ -25,98 +25,99 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/


#include "common.h"

#define PREFETCH_INS 1
#if defined(Z13_A)
#include <vecintrin.h>

/*
 * y += alpha * x using the compiler's vector extension.
 * x and y are viewed as arrays of two-double vectors; each outer pass
 * updates 16 such vectors (32 doubles), and the outer loop runs while the
 * vector index is below n / 2.
 */
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{
    __vector double scale = {alpha, alpha};
    __vector double *dst = (__vector double *)y;
    __vector double *src = (__vector double *)x;
    BLASLONG base;
    BLASLONG lane;

    for (base = 0; base < n / 2; base += 16) {
        /* 16 vector updates per pass, in ascending index order */
        for (lane = 0; lane < 16; lane++) {
            dst[base + lane] += scale * src[base + lane];
        }
    }
}
#else
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
"vlrepg %%v0,%3 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
__asm__ volatile(
#if defined(PREFETCH_INS)
"pfd 1, 0(%[x_tmp]) \n\t"
"pfd 2, 0(%[y_tmp]) \n\t"
#endif
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v0,%%r0,%%r0 \n\t"
"srlg %%r0,%[n],5 \n\t"
"vlr %%v1,%%v0 \n\t"
".align 16 \n\t"
"1: \n\t"
#if defined(PREFETCH_INS)
"pfd 1, 256(%[x_tmp]) \n\t"
"pfd 2, 256(%[y_tmp]) \n\t"
#endif
"vlm %%v16,%%v23, 0(%[x_tmp]) \n\t"
"vlm %%v24, %%v31, 0(%[y_tmp]) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
"vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
"vfmadb %%v20,%%v0,%%v20,%%v28 \n\t"
"vfmadb %%v21,%%v1,%%v21,%%v29 \n\t"
"vfmadb %%v22,%%v0,%%v22,%%v30 \n\t"
"vfmadb %%v23,%%v1,%%v23,%%v31 \n\t"
"vstm %%v16,%%v23, 0(%[y_tmp]) \n\t"
"vlm %%v24,%%v31, 128(%[x_tmp]) \n\t"
"vlm %%v16,%%v23, 128(%[y_tmp]) \n\t"
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
"vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
"vfmadb %%v28,%%v0,%%v28,%%v20 \n\t"
"vfmadb %%v29,%%v1,%%v29,%%v21 \n\t"
"vfmadb %%v30,%%v0,%%v30,%%v22 \n\t"
"vfmadb %%v31,%%v1,%%v31,%%v23 \n\t"
"la %[x_tmp],256(%[x_tmp]) \n\t"
"vstm %%v24, %%v31, 128(%[y_tmp]) \n\t"
"la %[y_tmp],256(%[y_tmp]) \n\t"
"brctg %%r0,1b"
: [mem_y] "+m" (*(double (*)[n])y), [x_tmp] "+&a"(x), [y_tmp] "+&a"(y)
: [mem_x] "m" (*(const double (*)[n])x), [n] "r"(n), [alpha] "f"(alpha)
:"cc", "r0", "v0","v1","v16","v17","v18","v19","v20","v21",
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

"vfmadb %%v16,%%v0,%%v16,%%v20 \n\t"
"vfmadb %%v17,%%v0,%%v17,%%v21 \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v22 \n\t"
"vfmadb %%v19,%%v0,%%v19,%%v23 \n\t"

"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,80(%%r1,%1) \n\t"
"vl %%v26,96(%%r1,%1) \n\t"
"vl %%v27,112(%%r1,%1) \n\t"
"vl %%v28,64(%%r1,%2) \n\t"
"vl %%v29,80(%%r1,%2) \n\t"
"vl %%v30,96(%%r1,%2) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"

"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"

"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"

"vl %%v16,128(%%r1,%1) \n\t"
"vl %%v17,144(%%r1,%1) \n\t"
"vl %%v18,160(%%r1,%1) \n\t"
"vl %%v19,176(%%r1,%1) \n\t"
"vl %%v20,128(%%r1,%2) \n\t"
"vl %%v21,144(%%r1,%2) \n\t"
"vl %%v22,160(%%r1,%2) \n\t"
"vl %%v23,176(%%r1,%2) \n\t"
"vfmadb %%v16,%%v0,%%v16,%%v20 \n\t"
"vfmadb %%v17,%%v0,%%v17,%%v21 \n\t"
"vfmadb %%v18,%%v0,%%v18,%%v22 \n\t"
"vfmadb %%v19,%%v0,%%v19,%%v23 \n\t"

"vl %%v24,192(%%r1,%1) \n\t"
"vl %%v25,208(%%r1,%1) \n\t"
"vl %%v26,224(%%r1,%1) \n\t"
"vl %%v27,240(%%r1,%1) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"

"vfmadb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmadb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmadb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmadb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,128(%%r1,%2) \n\t"
"vst %%v17,144(%%r1,%2) \n\t"
"vst %%v18,160(%%r1,%2) \n\t"
"vst %%v19,176(%%r1,%2) \n\t"
"vst %%v20,192(%%r1,%2) \n\t"
"vst %%v21,208(%%r1,%2) \n\t"
"vst %%v22,224(%%r1,%2) \n\t"
"vst %%v23,240(%%r1,%2) \n\t"

"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

#endif

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
@@ -131,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -32;

if ( n1 )
daxpy_kernel_32(n1, x, y , da );
daxpy_kernel_32(n1, x, y , &da);

i = n1;
while(i < n)


+ 19
- 103
kernel/zarch/dcopy.c View File

@@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -23,95 +23,28 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
*****************************************************************************/

#include "common.h"

#if defined(Z13mvc)

/*
 * dcopy_kernel_32 (MVC variant): copy doubles from x to y in 256-byte
 * (32-double) chunks.  The loop count is n >> 5, so n is expected to be a
 * positive multiple of 32 (NOTE(review): confirm against the caller).
 * n, x and y are read-write asm operands, so the local copies are consumed.
 */
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {

__asm__ volatile(
/* prefetch the start of the source (fetch) and destination (store) */
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
/* n_tmp = n / 32 loop passes */
"srlg %[n_tmp],%[n_tmp],5 \n\t"
".align 16 \n\t"
"1: \n\t"
/* move 256 bytes, then advance both pointers by 256 */
"mvc 0(256,%[ptr_y]),0(%[ptr_x]) \n\t"
"la %[ptr_x],256(%[ptr_x]) \n\t"
"la %[ptr_y],256(%[ptr_y]) \n\t"
/* decrement the pass counter and loop while it is non-zero */
"brctg %[n_tmp],1b"
: [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n),
[ptr_x] "+&a"(x), [ptr_y] "+&a"(y)
: [mem_x] "m" (*(const double (*)[n])x)
: "cc"
);
return;

}
#else

static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) {

__asm__ volatile(
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"

"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"

"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 112(%%r1,%[ptr_y]) \n\t"


"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"

"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"

"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"

"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"

"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 240(%%r1,%[ptr_y]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_y] "=m" (*(double (*)[n])y), [n_tmp] "+&r"(n)
: [mem_x] "m" (*(const double (*)[n])x), [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v24","v25","v26","v27"
);
return;

/*
 * dcopy_kernel_32: copy doubles from x to y in 256-byte (32-double) chunks
 * using MVC.  Asm operands: %0 = n, %1 = address of x, %2 = address of y.
 * The loop count is n >> 5, so n is expected to be a positive multiple of
 * 32 (NOTE(review): confirm against the caller).
 */
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* r1 = running source address, r2 = running destination address */
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
/* r0 = n / 32 loop passes */
"srlg %%r0,%0,5 \n\t"
"0: \n\t"
/* prefetch 1024 bytes ahead: source for fetch, destination for store */
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
/* move 256 bytes, then advance both addresses by 256 */
"mvc 0(256,%%r2),0(%%r1) \n\t"
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
/* decrement r0 and loop while it is non-zero */
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","r2"
);
}
#endif

int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
@@ -136,21 +69,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {

} else {

BLASLONG n1 = n & -4;

while (i < n1) {

y[iy] = x[ix];
y[iy + inc_y] = x[ix + inc_x];
y[iy + 2 * inc_y] = x[ix + 2 * inc_x];
y[iy + 3 * inc_y] = x[ix + 3 * inc_x];

ix += inc_x * 4;
iy += inc_y * 4;
i += 4;

}

while (i < n) {

y[iy] = x[ix];
@@ -165,5 +83,3 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {


}



+ 48
- 107
kernel/zarch/ddot.c View File

@@ -25,116 +25,59 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/


#include "common.h"


#if defined(Z13)
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
FLOAT dot;
__asm__ volatile(
"pfd 1, 0(%[ptr_x_tmp]) \n\t"
"pfd 1, 0(%[ptr_y_tmp]) \n\t"
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %[n_tmp],%[n_tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t"
"pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t"

"vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
"vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
"vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
"vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
"vl %%v16, 64(%%r1 ,%[ptr_x_tmp]) \n\t"
"vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v19, 112(%%r1,%[ptr_x_tmp]) \n\t"

"vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
"vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v25,%%v17,%%v29,%%v25 \n\t"
"vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v26,%%v18,%%v30,%%v26 \n\t"
"vl %%v31, 112(%%r1,%[ptr_y_tmp]) \n\t"
"vfmadb %%v27,%%v19,%%v31,%%v27 \n\t"
"la %%r1,128(%%r1) \n\t"
"brctg %[n_tmp],1b \n\t"
"vfadb %%v24,%%v25,%%v24 \n\t"
"vfadb %%v24,%%v26,%%v24 \n\t"
"vfadb %%v24,%%v27,%%v24 \n\t"
"vrepg %%v1,%%v24,1 \n\t"
"vfadb %%v1,%%v24,%%v1 \n\t"
"ldr %[dot], %%f1 \n\t"
: [dot] "=f"(dot) ,[n_tmp] "+&r"(n)
: [mem_x] "m"( *(const double (*)[n])x),
[mem_y] "m"( *(const double (*)[n])y),
[ptr_x_tmp]"a"(x), [ptr_y_tmp] "a"(y)
:"cc" , "r1","f1","v16", "v17","v18","v19","v20","v21","v22","v23",
"v24","v25","v26","v27","v28","v29","v30","v31"

);
return dot;

}

__asm__ volatile (
"vzero %%v0 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"

"vl %%v24,0(%%r1,%3) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,16(%%r1,%3) \n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"
"vl %%v27,48(%%r1,%3) \n\t"
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"
"vl %%v28,64(%%r1,%3) \n\t"
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%3) \n\t"
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%3) \n\t"
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

#else

/*
 * Portable dot-product kernel: consumes x and y sixteen elements per pass
 * and returns the accumulated sum of products.  Each pass adds two
 * eight-term partial sums to the accumulator, in that order.
 */
static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y )
{
    FLOAT acc = 0.0;
    BLASLONG base;

    for (base = 0; base < n; base += 16) {
        FLOAT *xs = x + base;
        FLOAT *ys = y + base;

        /* first eight products, summed left to right */
        acc += ys[0] * xs[0]
             + ys[1] * xs[1]
             + ys[2] * xs[2]
             + ys[3] * xs[3]
             + ys[4] * xs[4]
             + ys[5] * xs[5]
             + ys[6] * xs[6]
             + ys[7] * xs[7];

        /* second eight products, summed left to right */
        acc += ys[8] * xs[8]
             + ys[9] * xs[9]
             + ys[10] * xs[10]
             + ys[11] * xs[11]
             + ys[12] * xs[12]
             + ys[13] * xs[13]
             + ys[14] * xs[14]
             + ys[15] * xs[15];
    }

    return acc;
}

#endif

FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
@@ -148,13 +91,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{

BLASLONG n1 = n & -16;
if ( n1 ){
dot = ddot_kernel_16(n1, x, y );
i = n1;
}

if ( n1 )
dot = ddot_kernel_16(n1, x, y);

i = n1;
while(i < n)
{



+ 361
- 155
kernel/zarch/dgemv_n_4.c View File

@@ -25,186 +25,392 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/


#include "common.h"

#define NBMAX 2048

#define HAVE_KERNEL_4x4_VEC 1
#define HAVE_KERNEL_4x2_VEC 1
#define HAVE_KERNEL_4x1_VEC 1

#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
#include <vecintrin.h>
#endif

#ifdef HAVE_KERNEL_4x4

#elif HAVE_KERNEL_4x4_VEC

/*
 * y += (alpha * xo[0..3]) applied to the four columns ap[0..3], using the
 * compiler's vector extension.  y and the columns are viewed as arrays of
 * two-double vectors; each outer pass updates two vectors (four doubles)
 * and the outer loop runs while the vector index is below n / 2.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    FLOAT s0 = xo[0] * *alpha;
    FLOAT s1 = xo[1] * *alpha;
    FLOAT s2 = xo[2] * *alpha;
    FLOAT s3 = xo[3] * *alpha;

    /* broadcast each scaled x entry into both vector lanes */
    __vector double c0 = {s0, s0};
    __vector double c1 = {s1, s1};
    __vector double c2 = {s2, s2};
    __vector double c3 = {s3, s3};

    __vector double *acc = (__vector double *)y;
    __vector double *col0 = (__vector double *)ap[0];
    __vector double *col1 = (__vector double *)ap[1];
    __vector double *col2 = (__vector double *)ap[2];
    __vector double *col3 = (__vector double *)ap[3];

    BLASLONG pair;
    BLASLONG k;

    for (pair = 0; pair < n / 2; pair += 2) {
        for (k = pair; k < pair + 2; k++) {
            acc[k] += c0 * col0[k] + c1 * col1[k] + c2 * col2[k] + c3 * col3[k];
        }
    }
}

#else

/*
 * Scalar fallback: y += (alpha * xo[0..3]) applied to the four columns
 * ap[0..3].  Rows are processed four at a time while the row index is
 * below n.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    FLOAT scaled[4] __attribute__ ((aligned (16)));
    FLOAT *col0 = ap[0];
    FLOAT *col1 = ap[1];
    FLOAT *col2 = ap[2];
    FLOAT *col3 = ap[3];
    BLASLONG row;
    BLASLONG k;

    /* pre-scale the four x entries by alpha once */
    for (k = 0; k < 4; k++) {
        scaled[k] = xo[k] * *alpha;
    }

    for (row = 0; row < n; row += 4) {
        for (k = row; k < row + 4; k++) {
            y[k] += col0[k]*scaled[0] + col1[k]*scaled[1] + col2[k]*scaled[2] + col3[k]*scaled[3];
        }
    }
}


#endif

#ifdef HAVE_KERNEL_4x2

#elif HAVE_KERNEL_4x2_VEC

static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT x0,x1;
x0 = xo[0] * *alpha;
x1 = xo[1] * *alpha;
__vector double v_x0 = {x0,x0};
__vector double v_x1 = {x1,x1};
__vector double* v_y =(__vector double*)y;
__vector double* va0 = (__vector double*)ap[0];
__vector double* va1 = (__vector double*)ap[1];

for ( i=0; i< n/2; i+=2 )
{
v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ;
v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ;
}
__asm__ volatile (
"vlrepg %%v0,0(%5) \n\t"
"vlrepg %%v1,8(%5) \n\t"
"vlrepg %%v2,16(%5) \n\t"
"vlrepg %%v3,24(%5) \n\t"
"vlrepg %%v4,%7 \n\t"
"vfmdb %%v0,%%v0,%%v4 \n\t"
"vfmdb %%v1,%%v1,%%v4 \n\t"
"vfmdb %%v2,%%v2,%%v4 \n\t"
"vfmdb %%v3,%%v3,%%v4 \n\t"
"xgr %%r1,%%r1 \n\t"

"lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"

"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 2,1024(%%r1,%6) \n\t"

"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"vl %%v19,0(%%r1,%4) \n\t"
"vl %%v20,16(%%r1,%1) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,16(%%r1,%3) \n\t"
"vl %%v23,16(%%r1,%4) \n\t"
"vl %%v24,32(%%r1,%1) \n\t"
"vl %%v25,32(%%r1,%2) \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vl %%v27,32(%%r1,%4) \n\t"
"vl %%v28,48(%%r1,%1) \n\t"
"vl %%v29,48(%%r1,%2) \n\t"
"vl %%v30,48(%%r1,%3) \n\t"
"vl %%v31,48(%%r1,%4) \n\t"

"vl %%v4,0(%%r1,%6) \n\t"
"vfmadb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,0(%%r1,%6) \n\t"

"vl %%v4,16(%%r1,%6) \n\t"
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,16(%%r1,%6) \n\t"

"vl %%v4,32(%%r1,%6) \n\t"
"vfmadb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,32(%%r1,%6) \n\t"

"vl %%v4,48(%%r1,%6) \n\t"
"vfmadb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,48(%%r1,%6) \n\t"

"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,64(%%r1,%2) \n\t"
"vl %%v18,64(%%r1,%3) \n\t"
"vl %%v19,64(%%r1,%4) \n\t"
"vl %%v20,80(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,80(%%r1,%3) \n\t"
"vl %%v23,80(%%r1,%4) \n\t"
"vl %%v24,96(%%r1,%1) \n\t"
"vl %%v25,96(%%r1,%2) \n\t"
"vl %%v26,96(%%r1,%3) \n\t"
"vl %%v27,96(%%r1,%4) \n\t"
"vl %%v28,112(%%r1,%1) \n\t"
"vl %%v29,112(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%4) \n\t"

"vl %%v4,64(%%r1,%6) \n\t"
"vfmadb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,64(%%r1,%6) \n\t"

"vl %%v4,80(%%r1,%6) \n\t"
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,80(%%r1,%6) \n\t"

"vl %%v4,96(%%r1,%6) \n\t"
"vfmadb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,96(%%r1,%6) \n\t"

"vl %%v4,112(%%r1,%6) \n\t"
"vfmadb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,112(%%r1,%6) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"

"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"

"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"vl %%v19,0(%%r1,%4) \n\t"
"vl %%v20,16(%%r1,%1) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,16(%%r1,%3) \n\t"
"vl %%v23,16(%%r1,%4) \n\t"

"vl %%v4,0(%%r1,%6) \n\t"
"vfmadb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,0(%%r1,%6) \n\t"

"vl %%v4,16(%%r1,%6) \n\t"
"vfmadb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmadb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmadb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmadb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,16(%%r1,%6) \n\t"

"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"

"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#else

/*
 * Plain C fallback for the two-vector update
 *   y[k] += ap[0][k] * (xo[0] * *alpha) + ap[1][k] * (xo[1] * *alpha)
 * for k = 0 .. n-1.
 *
 * Elements are handled in groups of four, so the loop always touches a whole
 * group; presumably callers pass n as a multiple of 4 (confirm at call site).
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
    FLOAT *src0 = ap[0];
    FLOAT *src1 = ap[1];
    /* Fold alpha into both x entries once, before y is modified. */
    FLOAT scaled0 = xo[0] * *alpha;
    FLOAT scaled1 = xo[1] * *alpha;
    BLASLONG group;
    BLASLONG k;

    for (group = 0; group < n; group += 4) {
        for (k = group; k < group + 4; k++) {
            y[k] += src0[k] * scaled0 + src1[k] * scaled1;
        }
    }
}


#endif

#ifdef HAVE_KERNEL_4x1

#elif HAVE_KERNEL_4x1_VEC
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT x0;
x0 = xo[0] * *alpha;
__vector double v_x0 = {x0,x0};
__vector double* v_y =(__vector double*)y;
__vector double* va0 = (__vector double*)ap;

for ( i=0; i< n/2; i+=2 )
{
v_y[i] += v_x0 * va0[i] ;
v_y[i+1] += v_x0 * va0[i+1] ;
}
__asm__ volatile (
"vlrepg %%v0,0(%3) \n\t"
"vlrepg %%v1,8(%3) \n\t"
"vlrepg %%v2,%5 \n\t"
"vfmdb %%v0,%%v0,%%v2 \n\t"
"vfmdb %%v1,%%v1,%%v2 \n\t"
"xgr %%r1,%%r1 \n\t"

"lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"

"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%4) \n\t"

"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,16(%%r1,%1) \n\t"
"vl %%v19,16(%%r1,%2) \n\t"
"vl %%v20,32(%%r1,%1) \n\t"
"vl %%v21,32(%%r1,%2) \n\t"
"vl %%v22,48(%%r1,%1) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vl %%v26,80(%%r1,%1) \n\t"
"vl %%v27,80(%%r1,%2) \n\t"
"vl %%v28,96(%%r1,%1) \n\t"
"vl %%v29,96(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%1) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"

"vl %%v2,0(%%r1,%4) \n\t"
"vfmadb %%v2,%%v16,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v17,%%v1,%%v2 \n\t"
"vst %%v2,0(%%r1,%4) \n\t"

"vl %%v2,16(%%r1,%4) \n\t"
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t"
"vst %%v2,16(%%r1,%4) \n\t"

"vl %%v2,32(%%r1,%4) \n\t"
"vfmadb %%v2,%%v20,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v21,%%v1,%%v2 \n\t"
"vst %%v2,32(%%r1,%4) \n\t"

"vl %%v2,48(%%r1,%4) \n\t"
"vfmadb %%v2,%%v22,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v23,%%v1,%%v2 \n\t"
"vst %%v2,48(%%r1,%4) \n\t"

"vl %%v2,64(%%r1,%4) \n\t"
"vfmadb %%v2,%%v24,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v25,%%v1,%%v2 \n\t"
"vst %%v2,64(%%r1,%4) \n\t"

"vl %%v2,80(%%r1,%4) \n\t"
"vfmadb %%v2,%%v26,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v27,%%v1,%%v2 \n\t"
"vst %%v2,80(%%r1,%4) \n\t"

"vl %%v2,96(%%r1,%4) \n\t"
"vfmadb %%v2,%%v28,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v29,%%v1,%%v2 \n\t"
"vst %%v2,96(%%r1,%4) \n\t"

"vl %%v2,112(%%r1,%4) \n\t"
"vfmadb %%v2,%%v30,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v31,%%v1,%%v2 \n\t"
"vst %%v2,112(%%r1,%4) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"

"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"

"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,16(%%r1,%1) \n\t"
"vl %%v19,16(%%r1,%2) \n\t"

"vl %%v2,0(%%r1,%4) \n\t"
"vfmadb %%v2,%%v16,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v17,%%v1,%%v2 \n\t"
"vst %%v2,0(%%r1,%4) \n\t"

"vl %%v2,16(%%r1,%4) \n\t"
"vfmadb %%v2,%%v18,%%v0,%%v2 \n\t"
"vfmadb %%v2,%%v19,%%v1,%%v2 \n\t"
"vst %%v2,16(%%r1,%4) \n\t"

"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"

"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

#else
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
BLASLONG i;
FLOAT *a0;
FLOAT x[4] __attribute__ ((aligned (16)));
a0 = ap;

for ( i=0; i<1; i++)
x[i] = xo[i] * *alpha;

for ( i=0; i< n; i+=4 )
{
y[i] += a0[i]*x[0];
y[i+1] += a0[i+1]*x[0];
y[i+2] += a0[i+2]*x[0];
y[i+3] += a0[i+3]*x[0];
}
__asm__ volatile (
"vlrepg %%v0,0(%2) \n\t"
"vlrepg %%v1,%4 \n\t"
"vfmdb %%v0,%%v0,%%v1 \n\t"
"xgr %%r1,%%r1 \n\t"

"lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"

"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"

"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%1) \n\t"
"vl %%v22,96(%%r1,%1) \n\t"
"vl %%v23,112(%%r1,%1) \n\t"

"vl %%v1,0(%%r1,%3) \n\t"
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"

"vl %%v1,16(%%r1,%3) \n\t"
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t"
"vst %%v1,16(%%r1,%3) \n\t"

"vl %%v1,32(%%r1,%3) \n\t"
"vfmadb %%v1,%%v18,%%v0,%%v1 \n\t"
"vst %%v1,32(%%r1,%3) \n\t"

"vl %%v1,48(%%r1,%3) \n\t"
"vfmadb %%v1,%%v19,%%v0,%%v1 \n\t"
"vst %%v1,48(%%r1,%3) \n\t"

"vl %%v1,64(%%r1,%3) \n\t"
"vfmadb %%v1,%%v20,%%v0,%%v1 \n\t"
"vst %%v1,64(%%r1,%3) \n\t"

"vl %%v1,80(%%r1,%3) \n\t"
"vfmadb %%v1,%%v21,%%v0,%%v1 \n\t"
"vst %%v1,80(%%r1,%3) \n\t"

"vl %%v1,96(%%r1,%3) \n\t"
"vfmadb %%v1,%%v22,%%v0,%%v1 \n\t"
"vst %%v1,96(%%r1,%3) \n\t"

"vl %%v1,112(%%r1,%3) \n\t"
"vfmadb %%v1,%%v23,%%v0,%%v1 \n\t"
"vst %%v1,112(%%r1,%3) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"

"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"

"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"

"vl %%v1,0(%%r1,%3) \n\t"
"vfmadb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"

"vl %%v1,16(%%r1,%3) \n\t"
"vfmadb %%v1,%%v17,%%v0,%%v1 \n\t"
"vst %%v1,16(%%r1,%3) \n\t"

"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"

"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}


#endif


static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
BLASLONG i;
for ( i=0; i<n; i++ ){
*dest += *src;
src++;
dest += inc_dest;
for (i = 0; i < n; i++)
{
*dest += src[i];
dest += inc_dest;
}
return;
}

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;


+ 430
- 148
kernel/zarch/dgemv_t_4.c View File

@@ -25,178 +25,460 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/


#include "common.h"

#define HAVE_KERNEL_4x4_VEC 1
#define HAVE_KERNEL_4x2_VEC 1
#define HAVE_KERNEL_4x1_VEC 1

#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
#include <vecintrin.h>
#endif
#define NBMAX 2048

#ifdef HAVE_KERNEL_4x4

#elif HAVE_KERNEL_4x4_VEC

/*
 * Four dot products against the same x, computed with 2-lane double vectors:
 *   y[r] = sum of ap[r][k] * x[k],  r = 0..3.
 * Each loop pass consumes two vectors (four doubles) from x and from every
 * ap[r]. y is overwritten, not accumulated into.
 * NOTE(review): because four doubles are consumed per pass, n is presumably
 * a multiple of 4 - confirm against the caller.
 */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    __vector double *row0 = (__vector double *)ap[0];
    __vector double *row1 = (__vector double *)ap[1];
    __vector double *row2 = (__vector double *)ap[2];
    __vector double *row3 = (__vector double *)ap[3];
    __vector double *xv = (__vector double *)x;
    __vector double acc0 = {0,0};
    __vector double acc1 = {0,0};
    __vector double acc2 = {0,0};
    __vector double acc3 = {0,0};
    BLASLONG pairs = n / 2;   /* number of 2-double vectors covering n doubles */
    BLASLONG k;

    for (k = 0; k < pairs; k += 2) {
        __vector double xa = xv[k];
        __vector double xb = xv[k + 1];

        acc0 += xa * row0[k] + xb * row0[k + 1];
        acc1 += xa * row1[k] + xb * row1[k + 1];
        acc2 += xa * row2[k] + xb * row2[k + 1];
        acc3 += xa * row3[k] + xb * row3[k + 1];
    }

    /* Horizontal add of the two lanes gives each scalar result. */
    y[0] = acc0[0] + acc0[1];
    y[1] = acc1[0] + acc1[1];
    y[2] = acc2[0] + acc2[1];
    y[3] = acc3[0] + acc3[1];
}
#else
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
BLASLONG i;
FLOAT *a0,*a1,*a2,*a3;
a0 = ap[0];
a1 = ap[1];
a2 = ap[2];
a3 = ap[3];
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;
FLOAT temp2 = 0.0;
FLOAT temp3 = 0.0;

for ( i=0; i< n; i+=4 )
{
temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3];
temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3];
temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3];
}
y[0] = temp0;
y[1] = temp1;
y[2] = temp2;
y[3] = temp3;
__asm__ volatile (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
"xgr %%r1,%%r1 \n\t"

"lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"

"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 1,1024(%%r1,%5) \n\t"

"vl %%v16,0(%%r1,%5) \n\t"
"vl %%v17,16(%%r1,%5) \n\t"
"vl %%v18,32(%%r1,%5) \n\t"
"vl %%v19,48(%%r1,%5) \n\t"
"vl %%v20,64(%%r1,%5) \n\t"
"vl %%v21,80(%%r1,%5) \n\t"
"vl %%v22,96(%%r1,%5) \n\t"
"vl %%v23,112(%%r1,%5) \n\t"

"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
"vl %%v26,0(%%r1,%3) \n\t"
"vfmadb %%v2,%%v16,%%v26,%%v2 \n\t"
"vl %%v27,0(%%r1,%4) \n\t"
"vfmadb %%v3,%%v16,%%v27,%%v3 \n\t"

"vl %%v28,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v28,%%v0 \n\t"
"vl %%v29,16(%%r1,%2) \n\t"
"vfmadb %%v1,%%v17,%%v29,%%v1 \n\t"
"vl %%v30,16(%%r1,%3) \n\t"
"vfmadb %%v2,%%v17,%%v30,%%v2 \n\t"
"vl %%v31,16(%%r1,%4) \n\t"
"vfmadb %%v3,%%v17,%%v31,%%v3 \n\t"

"vl %%v24,32(%%r1,%1) \n\t"
"vfmadb %%v0,%%v18,%%v24,%%v0 \n\t"
"vl %%v25,32(%%r1,%2) \n\t"
"vfmadb %%v1,%%v18,%%v25,%%v1 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmadb %%v2,%%v18,%%v26,%%v2 \n\t"
"vl %%v27,32(%%r1,%4) \n\t"
"vfmadb %%v3,%%v18,%%v27,%%v3 \n\t"

"vl %%v28,48(%%r1,%1) \n\t"
"vfmadb %%v0,%%v19,%%v28,%%v0 \n\t"
"vl %%v29,48(%%r1,%2) \n\t"
"vfmadb %%v1,%%v19,%%v29,%%v1 \n\t"
"vl %%v30,48(%%r1,%3) \n\t"
"vfmadb %%v2,%%v19,%%v30,%%v2 \n\t"
"vl %%v31,48(%%r1,%4) \n\t"
"vfmadb %%v3,%%v19,%%v31,%%v3 \n\t"

"vl %%v24,64(%%r1,%1) \n\t"
"vfmadb %%v0,%%v20,%%v24,%%v0 \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vfmadb %%v1,%%v20,%%v25,%%v1 \n\t"
"vl %%v26,64(%%r1,%3) \n\t"
"vfmadb %%v2,%%v20,%%v26,%%v2 \n\t"
"vl %%v27,64(%%r1,%4) \n\t"
"vfmadb %%v3,%%v20,%%v27,%%v3 \n\t"

"vl %%v28,80(%%r1,%1) \n\t"
"vfmadb %%v0,%%v21,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%2) \n\t"
"vfmadb %%v1,%%v21,%%v29,%%v1 \n\t"
"vl %%v30,80(%%r1,%3) \n\t"
"vfmadb %%v2,%%v21,%%v30,%%v2 \n\t"
"vl %%v31,80(%%r1,%4) \n\t"
"vfmadb %%v3,%%v21,%%v31,%%v3 \n\t"

"vl %%v24,96(%%r1,%1) \n\t"
"vfmadb %%v0,%%v22,%%v24,%%v0 \n\t"
"vl %%v25,96(%%r1,%2) \n\t"
"vfmadb %%v1,%%v22,%%v25,%%v1 \n\t"
"vl %%v26,96(%%r1,%3) \n\t"
"vfmadb %%v2,%%v22,%%v26,%%v2 \n\t"
"vl %%v27,96(%%r1,%4) \n\t"
"vfmadb %%v3,%%v22,%%v27,%%v3 \n\t"

"vl %%v28,112(%%r1,%1) \n\t"
"vfmadb %%v0,%%v23,%%v28,%%v0 \n\t"
"vl %%v29,112(%%r1,%2) \n\t"
"vfmadb %%v1,%%v23,%%v29,%%v1 \n\t"
"vl %%v30,112(%%r1,%3) \n\t"
"vfmadb %%v2,%%v23,%%v30,%%v2 \n\t"
"vl %%v31,112(%%r1,%4) \n\t"
"vfmadb %%v3,%%v23,%%v31,%%v3 \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"

"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"

"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%5) \n\t"
"vl %%v17,16(%%r1,%5) \n\t"

"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"
"vl %%v26,0(%%r1,%3) \n\t"
"vfmadb %%v2,%%v16,%%v26,%%v2 \n\t"
"vl %%v27,0(%%r1,%4) \n\t"
"vfmadb %%v3,%%v16,%%v27,%%v3 \n\t"

"vl %%v28,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v28,%%v0 \n\t"
"vl %%v29,16(%%r1,%2) \n\t"
"vfmadb %%v1,%%v17,%%v29,%%v1 \n\t"
"vl %%v30,16(%%r1,%3) \n\t"
"vfmadb %%v2,%%v17,%%v30,%%v2 \n\t"
"vl %%v31,16(%%r1,%4) \n\t"
"vfmadb %%v3,%%v17,%%v31,%%v3 \n\t"

"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"

"3: \n\t"
"vrepg %%v4,%%v0,1 \n\t"
"adbr %%f0,%%f4 \n\t"
"std %%f0,0(%6) \n\t"
"vrepg %%v4,%%v1,1 \n\t"
"adbr %%f1,%%f4 \n\t"
"std %%f1,8(%6) \n\t"
"vrepg %%v4,%%v2,1 \n\t"
"adbr %%f2,%%f4 \n\t"
"std %%f2,16(%6) \n\t"
"vrepg %%v4,%%v3,1 \n\t"
"adbr %%f3,%%f4 \n\t"
"std %%f3,24(%6) "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#endif
#ifdef HAVE_KERNEL_4x2

#elif HAVE_KERNEL_4x2_VEC

/*
 * Two dot products against the same x, computed with 2-lane double vectors:
 *   y[r] = sum of ap[r][k] * x[k],  r = 0..1.
 * Each loop pass consumes two vectors (four doubles) from x and from both
 * ap[r]. y is overwritten, not accumulated into.
 * NOTE(review): because four doubles are consumed per pass, n is presumably
 * a multiple of 4 - confirm against the caller.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    __vector double *row0 = (__vector double *)ap[0];
    __vector double *row1 = (__vector double *)ap[1];
    __vector double *xv = (__vector double *)x;
    __vector double acc0 = {0,0};
    __vector double acc1 = {0,0};
    BLASLONG pairs = n / 2;   /* number of 2-double vectors covering n doubles */
    BLASLONG k;

    for (k = 0; k < pairs; k += 2) {
        __vector double xa = xv[k];
        __vector double xb = xv[k + 1];

        acc0 += xa * row0[k] + xb * row0[k + 1];
        acc1 += xa * row1[k] + xb * row1[k + 1];
    }

    /* Horizontal add of the two lanes gives each scalar result. */
    y[0] = acc0[0] + acc0[1];
    y[1] = acc1[0] + acc1[1];
}
#else
static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{

BLASLONG i;
FLOAT *a0,*a1;
a0 = ap[0];
a1 = ap[1];
FLOAT temp0 = 0.0;
FLOAT temp1 = 0.0;

for ( i=0; i< n; i+=4 )
{
temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3];
}
y[0] = temp0;
y[1] = temp1;

__asm__ volatile (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"xgr %%r1,%%r1 \n\t"

"lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"

"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"

"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"

"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"

"vl %%v26,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v26,%%v0 \n\t"
"vl %%v27,16(%%r1,%2) \n\t"
"vfmadb %%v1,%%v17,%%v27,%%v1 \n\t"

"vl %%v28,32(%%r1,%1) \n\t"
"vfmadb %%v0,%%v18,%%v28,%%v0 \n\t"
"vl %%v29,32(%%r1,%2) \n\t"
"vfmadb %%v1,%%v18,%%v29,%%v1 \n\t"

"vl %%v30,48(%%r1,%1) \n\t"
"vfmadb %%v0,%%v19,%%v30,%%v0 \n\t"
"vl %%v31,48(%%r1,%2) \n\t"
"vfmadb %%v1,%%v19,%%v31,%%v1 \n\t"

"vl %%v24,64(%%r1,%1) \n\t"
"vfmadb %%v0,%%v20,%%v24,%%v0 \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vfmadb %%v1,%%v20,%%v25,%%v1 \n\t"

"vl %%v26,80(%%r1,%1) \n\t"
"vfmadb %%v0,%%v21,%%v26,%%v0 \n\t"
"vl %%v27,80(%%r1,%2) \n\t"
"vfmadb %%v1,%%v21,%%v27,%%v1 \n\t"

"vl %%v28,96(%%r1,%1) \n\t"
"vfmadb %%v0,%%v22,%%v28,%%v0 \n\t"
"vl %%v29,96(%%r1,%2) \n\t"
"vfmadb %%v1,%%v22,%%v29,%%v1 \n\t"

"vl %%v30,112(%%r1,%1) \n\t"
"vfmadb %%v0,%%v23,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vfmadb %%v1,%%v23,%%v31,%%v1 \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"

"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"

"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"

"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmadb %%v1,%%v16,%%v25,%%v1 \n\t"

"vl %%v26,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v26,%%v0 \n\t"
"vl %%v27,16(%%r1,%2) \n\t"
"vfmadb %%v1,%%v17,%%v27,%%v1 \n\t"

"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"

"3: \n\t"
"vrepg %%v2,%%v0,1 \n\t"
"adbr %%f0,%%f2 \n\t"
"std %%f0,0(%4) \n\t"
"vrepg %%v2,%%v1,1 \n\t"
"adbr %%f1,%%f2 \n\t"
"std %%f1,8(%4) "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#endif

#ifdef HAVE_KERNEL_4x1

#elif HAVE_KERNEL_4x1_VEC

static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
{
BLASLONG i;
__vector double* va0 = (__vector double*)a0;
__vector double* v_x =(__vector double*)x;
__vector double temp0 = {0,0};

for ( i=0; i< n/2; i+=2 )
{
temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ;
}
y[0] = temp0[0] + temp0[1];
__asm__ volatile (
"vzero %%v0 \n\t"
"xgr %%r1,%%r1 \n\t"

"lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"

"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"

"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"

"vl %%v25,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"

"vl %%v26,32(%%r1,%1) \n\t"
"vfmadb %%v0,%%v18,%%v26,%%v0 \n\t"

"vl %%v27,48(%%r1,%1) \n\t"
"vfmadb %%v0,%%v19,%%v27,%%v0 \n\t"

"vl %%v28,64(%%r1,%1) \n\t"
"vfmadb %%v0,%%v20,%%v28,%%v0 \n\t"

"vl %%v29,80(%%r1,%1) \n\t"
"vfmadb %%v0,%%v21,%%v29,%%v0 \n\t"

"vl %%v30,96(%%r1,%1) \n\t"
"vfmadb %%v0,%%v22,%%v30,%%v0 \n\t"

"vl %%v31,112(%%r1,%1) \n\t"
"vfmadb %%v0,%%v23,%%v31,%%v0 \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"

"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"

"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"

"vl %%v24,0(%%r1,%1) \n\t"
"vfmadb %%v0,%%v16,%%v24,%%v0 \n\t"

"vl %%v25,16(%%r1,%1) \n\t"
"vfmadb %%v0,%%v17,%%v25,%%v0 \n\t"

"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"

"3: \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"std %%f0,0(%3) "
:
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#else
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
BLASLONG i;
FLOAT temp0 = 0.0;

for ( i=0; i< n; i+=4 )
for (i = 0; i < n; i++)
{
temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3];
dest[i] = *src;
src += inc_src;
}
y[0] = temp0;
}
#endif
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest)
{
BLASLONG i;
for ( i=0; i<n; i++ )
{
*dest = *src;
dest++;
src += inc_src;
}
__asm__ volatile (
"vlrepg %%v0,%1 \n\t"
"xgr %%r1,%%r1 \n\t"

"lghi %%r0,-16 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"

"srlg %%r0,%%r0,4 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"

"vl %%v24, 0(%%r1,%3) \n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
"vst %%v24, 0(%%r1,%3) \n\t"
"vl %%v25, 16(%%r1,%3) \n\t"
"vfmadb %%v25,%%v17,%%v0,%%v25 \n\t"
"vst %%v25, 16(%%r1,%3) \n\t"
"vl %%v26, 32(%%r1,%3) \n\t"
"vfmadb %%v26,%%v18,%%v0,%%v26 \n\t"
"vst %%v26, 32(%%r1,%3) \n\t"
"vl %%v27, 48(%%r1,%3) \n\t"
"vfmadb %%v27,%%v19,%%v0,%%v27 \n\t"
"vst %%v27, 48(%%r1,%3) \n\t"
"vl %%v28, 64(%%r1,%3) \n\t"
"vfmadb %%v28,%%v20,%%v0,%%v28 \n\t"
"vst %%v28, 64(%%r1,%3) \n\t"
"vl %%v29, 80(%%r1,%3) \n\t"
"vfmadb %%v29,%%v21,%%v0,%%v29 \n\t"
"vst %%v29, 80(%%r1,%3) \n\t"
"vl %%v30, 96(%%r1,%3) \n\t"
"vfmadb %%v30,%%v22,%%v0,%%v30 \n\t"
"vst %%v30, 96(%%r1,%3) \n\t"
"vl %%v31, 112(%%r1,%3) \n\t"
"vfmadb %%v31,%%v23,%%v0,%%v31 \n\t"
"vst %%v31, 112(%%r1,%3) \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"

"1: \n\t"
"lghi %%r0,12 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"

"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"

"vl %%v24, 0(%%r1,%3) \n\t"
"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
"vst %%v24, 0(%%r1,%3) \n\t"
"vl %%v25, 16(%%r1,%3) \n\t"
"vfmadb %%v25,%%v17,%%v0,%%v25 \n\t"
"vst %%v25, 16(%%r1,%3) \n\t"

"agfi %%r1,32 \n\t"
"brctg %%r0,2b \n\t"

"3: \n\t"
"nop "
:
:"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{

if (inc_dest == 1)
add_y_kernel_4(n, da, src, dest);
else
{
BLASLONG i;
for ( i=0; i<n; i++ )
for (i = 0; i < n; i++)
{
*dest += src[i] * da;
dest += inc_dest;
*dest += src[i] * da;
dest += inc_dest;
}
return;
}
}

int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -212,7 +494,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
FLOAT ybuffer[4],*xbuffer;
FLOAT ybuffer[2] __attribute__ ((aligned(16)));
FLOAT *xbuffer;
FLOAT *ytemp;

if ( m < 1 ) return(0);
@@ -234,7 +517,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO

while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{


+ 182
- 0
kernel/zarch/dmax.c View File

@@ -0,0 +1,182 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Vectorised maximum over the first n doubles of x (z/Architecture vector
 * facility, inline assembly).
 *
 * The iteration count is n >> 5 and every iteration consumes 256 bytes
 * (32 doubles), so any n % 32 trailing elements are NOT examined here.
 * The loop body sits ahead of the count test, so at least one full group of
 * 32 doubles is always read; the caller in this file only invokes the kernel
 * with n1 = n & -32 when n1 > 0.
 *
 * Register roles inside the asm:
 *   r0      remaining iteration count
 *   r1      byte offset into x
 *   v0      running per-lane maximum (2 doubles)
 *   v16-v23 loaded data, v24-v31 compare masks / partial maxima
 *
 * NOTE(review): %2 is used as a base address in every memory operand;
 * confirm the "ZR" constraint yields a plain address register here.
 */
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT max;

__asm__ volatile (
/* Seed the running maximum with x[0], x[1]. */
"vl %%v0,0(%2) \n\t"
/* r0 = n / 32 (loop count), r1 = 0 (byte offset). */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* Prefetch hint for the data 1024 bytes ahead of the current offset. */
"pfd 1, 1024(%%r1,%2) \n\t"

/* First half of the group: bytes 0..127 (16 doubles). */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* Pairwise max: mask = (a > b), then vsel keeps a where the mask is set, else b. */
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

/* Reduce 4 partial maxima to 2. */
"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

/* Reduce 2 partial maxima to 1. */
"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

/* Fold this half's maximum into the running maximum v0. */
"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* Second half of the group: bytes 128..255, same reduction tree. */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vfchdb %%v26,%%v20,%%v21 \n\t"
"vfchdb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchdb %%v28,%%v24,%%v25 \n\t"
"vfchdb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchdb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

"vfchdb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* Advance 256 bytes; decrement r0 and loop while it is non-zero. */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

/* Horizontal step: replicate lane 1, compare it with lane 0, keep the larger in lane 0. */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
/* Copy the scalar result (f0, i.e. lane 0 of v0) to the output register. */
"ldr %0,%%f0 "
:"=f"(max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return max;
}
/*
 * Returns the largest of n values read from x with stride inc_x.
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the leading n & -32 elements go through dmax_kernel_32 and
 * the remainder is scanned in C. Any other stride is scanned entirely in C,
 * four elements per pass followed by a scalar tail.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT best = 0.0;
    BLASLONG pos = 0;    /* offset of the next element to examine */
    BLASLONG seen = 0;   /* how many elements have been examined (strided path) */

    if (n <= 0 || inc_x <= 0)
        return (best);

    if (inc_x == 1) {
        BLASLONG bulk = n & -32;

        if (bulk > 0) {
            best = dmax_kernel_32(bulk, x);
            pos = bulk;
        } else {
            /* Fewer than 32 elements: seed from the first one. */
            best = x[0];
            pos = 1;
        }

        for (; pos < n; pos++) {
            if (x[pos] > best)
                best = x[pos];
        }
        return (best);
    }

    /* Strided access: seed from the first element, then walk the rest. */
    best = x[0];
    pos = inc_x;
    seen = 1;

    {
        BLASLONG unrolled_limit = (n - 1) & -4;

        while (seen < unrolled_limit) {
            BLASLONG u;

            for (u = 0; u < 4; u++) {
                if (x[pos + u * inc_x] > best)
                    best = x[pos + u * inc_x];
            }
            pos += inc_x * 4;
            seen += 4;
        }
    }

    for (; seen < n; seen++) {
        if (x[pos] > best)
            best = x[pos];
        pos += inc_x;
    }
    return (best);
}

+ 182
- 0
kernel/zarch/dmin.c View File

@@ -0,0 +1,182 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Vectorised minimum over the first n doubles of x (z/Architecture vector
 * facility, inline assembly).
 *
 * The iteration count is n >> 5 and every iteration consumes 256 bytes
 * (32 doubles), so any n % 32 trailing elements are NOT examined here.
 * The loop body sits ahead of the count test, so at least one full group of
 * 32 doubles is always read; the caller in this file only invokes the kernel
 * with n1 = n & -32 when n1 > 0.
 *
 * Register roles inside the asm:
 *   r0      remaining iteration count
 *   r1      byte offset into x
 *   v0      running per-lane minimum (2 doubles)
 *   v16-v23 loaded data, v24-v31 compare masks / partial minima
 *
 * NOTE(review): %2 is used as a base address in every memory operand;
 * confirm the "ZR" constraint yields a plain address register here.
 */
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x)
{
FLOAT min;

__asm__ volatile (
/* Seed the running minimum with x[0], x[1]. */
"vl %%v0,0(%2) \n\t"
/* r0 = n / 32 (loop count), r1 = 0 (byte offset). */
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* Prefetch hint for the data 1024 bytes ahead of the current offset. */
"pfd 1, 1024(%%r1,%2) \n\t"

/* First half of the group: bytes 0..127 (16 doubles). */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* Pairwise min: compare operands are swapped, mask = (b > a); vsel keeps a where set, else b. */
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

/* Reduce 4 partial minima to 2. */
"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

/* Reduce 2 partial minima to 1. */
"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

/* Fold this half's minimum into the running minimum v0. */
"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* Second half of the group: bytes 128..255, same reduction tree. */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vfchdb %%v26,%%v21,%%v20 \n\t"
"vfchdb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchdb %%v28,%%v25,%%v24 \n\t"
"vfchdb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchdb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

"vfchdb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* Advance 256 bytes; decrement r0 and loop while it is non-zero. */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

/* Horizontal step: replicate lane 1, compare it with lane 0, keep the smaller in lane 0. */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
/* Copy the scalar result (f0, i.e. lane 0 of v0) to the output register. */
"ldr %0,%%f0 "
:"=f"(min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return min;
}
/*
 * Returns the smallest of n values read from x with stride inc_x.
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the leading n & -32 elements go through dmin_kernel_32 and
 * the remainder is scanned in C. Any other stride is scanned entirely in C,
 * four elements per pass followed by a scalar tail.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT best = 0.0;
    BLASLONG pos = 0;    /* offset of the next element to examine */
    BLASLONG seen = 0;   /* how many elements have been examined (strided path) */

    if (n <= 0 || inc_x <= 0)
        return (best);

    if (inc_x == 1) {
        BLASLONG bulk = n & -32;

        if (bulk > 0) {
            best = dmin_kernel_32(bulk, x);
            pos = bulk;
        } else {
            /* Fewer than 32 elements: seed from the first one. */
            best = x[0];
            pos = 1;
        }

        for (; pos < n; pos++) {
            if (x[pos] < best)
                best = x[pos];
        }
        return (best);
    }

    /* Strided access: seed from the first element, then walk the rest. */
    best = x[0];
    pos = inc_x;
    seen = 1;

    {
        BLASLONG unrolled_limit = (n - 1) & -4;

        while (seen < unrolled_limit) {
            BLASLONG u;

            for (u = 0; u < 4; u++) {
                if (x[pos + u * inc_x] < best)
                    best = x[pos + u * inc_x];
            }
            pos += inc_x * 4;
            seen += 4;
        }
    }

    for (; seen < n; seen++) {
        if (x[pos] < best)
            best = x[pos];
        pos += inc_x;
    }
    return (best);
}

+ 166
- 172
kernel/zarch/drot.c View File

@@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

/*
 * drot_kernel_32 - rotate the vectors x and y in place, 32 doubles (256 bytes
 * of each vector) per loop iteration:
 *     x[i] = c * x[i] + s * y[i]
 *     y[i] = c * y[i] - s * x[i]
 * The loop count is n >> 5 and the loop is bottom-tested (brctg), so the body
 * always runs at least once; the visible caller only passes n1 = n & -32 when
 * n1 > 0.
 *
 * NOTE(review): two signatures and two complete asm statements follow. The
 * first takes the rotation by value (cosA/sinA), the second through pointers
 * (c/s). This looks like the removed and added sides of a diff rendered
 * without +/- markers - confirm against the real drot.c before building.
 */
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA)
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
/* By-value variant: cosA/sinA arrive in FP registers ("f" constraints). */
__asm__ (
"pfd 2, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
/* Copy each scalar to r1 and build a vector holding it in both lanes:
   v0 = {cos, cos}, v1 = {sin, sin}. */
"lgdr %%r1,%[cos] \n\t"
"vlvgp %%v0,%%r1,%%r1 \n\t"
"lgdr %%r1,%[sin] \n\t"
"vlvgp %%v1,%%r1,%%r1 \n\t"
/* n_tmp becomes the iteration count (n / 32); r1 is the running byte offset. */
"srlg %[n_tmp],%[n_tmp],5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
/* Bytes 0..63: v24-v27 hold x, v16-v19 hold y. */
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
/* v28-v31 = x*c (new x, first half); v20-v23 = x*s. */
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
/* v28-v31 += y*s completes the new x; v20-v23 = y*c - x*s is the new y. */
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */

"vst %%v28, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 48(%%r1,%[ptr_y]) \n\t"
/* Bytes 64..127: same sequence. */
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 112(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */

"vst %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
/* Bytes 128..191: same sequence. */
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */

"vst %%v28, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 176(%%r1,%[ptr_y]) \n\t"
/* Bytes 192..255: same sequence. */
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 240(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */

"vst %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 240(%%r1,%[ptr_y]) \n\t"

/* Advance the byte offset by one 256-byte block and loop while n_tmp != 0. */
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_x] "+m" (*(double (*)[n])x),
[mem_y] "+m" (*(double (*)[n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA)
: "cc", "r1" ,"v0","v1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;

/* By-pointer variant: operands are positional - %0 = n, %1 = x, %2 = y,
   %3 = *c, %4 = *s. r0 is the loop counter, r1 the running byte offset.
   It has no output operands; the "memory" clobber is what tells the
   compiler that x and y are written. */
__asm__ (
/* Load *c and *s from memory, replicated into both lanes of v0 / v1. */
"vlrepg %%v0,%3 \n\t"
"vlrepg %%v1,%4 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* Bytes 0..63. */
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
/* Bytes 64..127. */
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
/* Bytes 128..191. */
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
/* Bytes 192..255. */
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
@@ -214,8 +204,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
drot_kernel_32(n1, x, y, c, s);
FLOAT cosa,sina;
cosa=c;
sina=s;
drot_kernel_32(n1, x, y, &cosa, &sina);
i=n1;
}

@@ -229,6 +221,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT

}


}
else
{
@@ -250,3 +243,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT

}



+ 71
- 129
kernel/zarch/dscal.c View File

@@ -27,135 +27,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

/*
 * In-place scaling kernels: every element of x is multiplied by da.
 *
 * NOTE(review): this span shows dscal_kernel_32 (two variants selected by
 * Z13_A) and dscal_kernel_16 overlapping: two signatures share one opening
 * brace and a further asm statement follows a closing brace. It looks like
 * the removed and added sides of a diff rendered without +/- markers -
 * confirm against the real dscal.c before building.
 */
#ifdef Z13_A
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
{

/* Software-pipelined variant working on 128-byte blocks (n >> 4 of them):
   the loads for the next block are issued between the multiplies of the
   current one, and label 2 finishes the last block already in v16-v23. */
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
/* v0 = v1 = {da, da}. */
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v0,%%r0,%%r0 \n\t"
"srlg %[n],%[n],4 \n\t"
"vlr %%v1,%%v0 \n\t"
/* Prologue: preload the first block and step x_ptr past it. */
"vlm %%v16,%%v23, 0(%[x_ptr]) \n\t"
"la %[x_ptr], 128(%[x_ptr]) \n\t"
/* Only one block in total: skip the loop and go straight to the epilogue. */
"aghik %[n], %[n], -1 \n\t"
"jle 2f \n\t"
".align 16 \n\t"
"1: \n\t"
"vfmdb %%v24, %%v16, %%v0 \n\t"
"vfmdb %%v25, %%v17, %%v0 \n\t"
"vfmdb %%v26, %%v18, %%v0 \n\t"
"vfmdb %%v27, %%v19, %%v1 \n\t"
"vlm %%v16,%%v19, 0(%[x_ptr]) \n\t"
"vfmdb %%v28, %%v20, %%v0 \n\t"
"vfmdb %%v29, %%v21, %%v1 \n\t"
"vfmdb %%v30, %%v22, %%v0 \n\t"
"vfmdb %%v31, %%v23, %%v1 \n\t"
"vlm %%v20,%%v23, 64(%[x_ptr]) \n\t"
/* Step back to the block just scaled, store it, then move to the block
   after the one that was preloaded. */
"lay %[x_ptr], -128(%[x_ptr]) \n\t"
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t"
"la %[x_ptr],256(%[x_ptr]) \n\t"
"brctg %[n],1b \n\t"
"2: \n\t"
"vfmdb %%v24, %%v16, %%v0 \n\t"
"vfmdb %%v25, %%v17, %%v1 \n\t"
"vfmdb %%v26, %%v18, %%v0 \n\t"
"vfmdb %%v27, %%v19, %%v1 \n\t"
"lay %[x_ptr] , -128(%[x_ptr]) \n\t"
"vfmdb %%v28, %%v20, %%v0 \n\t"
"vfmdb %%v29, %%v21, %%v1 \n\t"
"vfmdb %%v30, %%v22, %%v0 \n\t"
"vfmdb %%v31, %%v23, %%v1 \n\t"
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t"
: [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n)
: [alpha] "f"(da)
:"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21",
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#else
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x)
{
/* dscal_kernel_16 body: operands are positional - %0 = n, %1 = da (read from
   memory), %2 = x. Runs n >> 4 iterations of 128 bytes (16 doubles) each;
   the loop is bottom-tested, so n must be at least 16. No output operands:
   the "memory" clobber covers the stores into x. */
__asm__ volatile (
"vlrepg %%v0,%1 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* Eight independent load / multiply / store triples, 16 bytes apiece. */
"vl %%v24, 0(%%r1,%2) \n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%2) \n\t"
"vfmdb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%2) \n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%2) \n\t"
"vfmdb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%2) \n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 64(%%r1,%2) \n\t"
"vl %%v25, 80(%%r1,%2) \n\t"
"vfmdb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 80(%%r1,%2) \n\t"
"vl %%v26, 96(%%r1,%2) \n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 96(%%r1,%2) \n\t"
"vl %%v27, 112(%%r1,%2) \n\t"
"vfmdb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 112(%%r1,%2) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
);
}

/* faster than sequence of triples(vl vfmd vst) (tested OPENBLAS_LOOPS=10000) */
/* dscal_kernel_32 (non-Z13_A) body: r0 = x_ptr + 8*n is the end address;
   each pass scales 256 bytes with vlm/vstm and the loop repeats while
   x_ptr is still below r0. */
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v0,%%r0,%%r0 \n\t"
"vlr %%v1,%%v0 \n\t"
"sllg %%r0,%[n],3 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%[x_ptr]) \n\t"
"vlm %%v16,%%v23, 0(%[x_ptr]) \n\t"
"vfmdb %%v16,%%v16,%%v0 \n\t"
"vfmdb %%v17,%%v17,%%v1 \n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t"
"vfmdb %%v19,%%v19,%%v1 \n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v1 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v1 \n\t"
"vstm %%v16,%%v23, 0(%[x_ptr]) \n\t"
"vlm %%v24,%%v31,128(%[x_ptr]) \n\t"
"vfmdb %%v24,%%v24,%%v0 \n\t"
"vfmdb %%v25,%%v25,%%v1 \n\t"
"vfmdb %%v26,%%v26,%%v0 \n\t"
"vfmdb %%v27,%%v27,%%v1 \n\t"
"vfmdb %%v28,%%v28,%%v0 \n\t"
"vfmdb %%v29,%%v29,%%v1 \n\t"
"vfmdb %%v30,%%v30,%%v0 \n\t"
"vfmdb %%v31,%%v31,%%v1 \n\t"
"vstm %%v24,%%v31,128(%[x_ptr]) \n\t"
"la %[x_ptr], 256(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n),[alpha] "f"(da)
:"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21",
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

}
#endif
/*
 * Zero-fill kernels: store 0.0 into the first n doubles of x without reading
 * them.
 *
 * NOTE(review): two signatures (dscal_kernel_32_zero and
 * dscal_kernel_16_zero) share one body that contains two asm statements.
 * This looks like the removed and added sides of a diff rendered without
 * +/- markers - confirm against the real dscal.c before building.
 */
static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x )
static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x)
{
/* Address-compare variant: r0 = x_ptr + 8*n is the end address; each pass
   writes 256 bytes of zeros and repeats while x_ptr is below r0. */
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
"vzero %%v24 \n\t"
"sllg %%r0,%[n],3 \n\t"
"vzero %%v25 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%[x_ptr]) \n\t"
"vst %%v24, 0(%[x_ptr]) \n\t"
"vst %%v25, 16(%[x_ptr]) \n\t"
"vst %%v24, 32(%[x_ptr]) \n\t"
"vst %%v25, 48(%[x_ptr]) \n\t"
"vst %%v24, 64(%[x_ptr]) \n\t"
"vst %%v25, 80(%[x_ptr]) \n\t"
"vst %%v24, 96(%[x_ptr]) \n\t"
"vst %%v25, 112(%[x_ptr]) \n\t"
"vst %%v24, 128(%[x_ptr]) \n\t"
"vst %%v25, 144(%[x_ptr]) \n\t"
"vst %%v24, 160(%[x_ptr]) \n\t"
"vst %%v25, 176(%[x_ptr]) \n\t"
"vst %%v24, 192(%[x_ptr]) \n\t"
"vst %%v25, 208(%[x_ptr]) \n\t"
"vst %%v24, 224(%[x_ptr]) \n\t"
"vst %%v25, 240(%[x_ptr]) \n\t"
"la %[x_ptr],256(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n)
:"cc" , "r0", "v24" ,"v25"
);
/* Counted variant: operands are positional - %0 = n, %1 = x. Runs n >> 4
   iterations of 128 bytes (16 doubles) each; the loop is bottom-tested, so
   n must be at least 16. The "memory" clobber covers the stores into x. */
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"

"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
}



int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0,j=0;
@@ -169,11 +109,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 )
{

BLASLONG n1 = n & -32;
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
dscal_kernel_32_zero(n1 , x);
dscal_kernel_16_zero(n1, x);
j=n1;
}

@@ -188,10 +128,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
else
{

BLASLONG n1 = n & -32;
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
dscal_kernel_32(n1 , da , x);
dscal_kernel_16(n1, da, x);
j=n1;
}
while(j < n)
@@ -260,4 +200,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
}
return 0;

}
}



+ 180
- 0
kernel/zarch/dsdot.c View File

@@ -0,0 +1,180 @@
/***************************************************************************
Copyright (c) 2013-2018,The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms,with or without
modification,are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice,this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice,this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL
DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * dsdot_kernel_32 - dot product of the first n elements of x and y, returned
 * as a double. Each loop iteration consumes 128 bytes (32 four-byte elements)
 * of each vector; the loop runs n >> 5 times and is bottom-tested, so n must
 * be a positive multiple of 32 (the visible caller passes n & -32 only when
 * it is non-zero).
 *
 * Operands: %0 = dot (output), %1 = n, %2 = x, %3 = y.
 *
 * NOTE(review): the element products are formed with vfmsb, i.e. in single
 * precision, and only afterwards widened to double for accumulation.
 * Reference DSDOT widens the operands before multiplying - confirm whether
 * rounding each product to single precision is acceptable here.
 */
static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
double dot;

__asm__ volatile (
/* v0 is the running sum (two double lanes); r0 counts iterations and r1 is
   the running byte offset. */
"vzero %%v0 \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"

/* v16-v23 = 128 bytes of x. */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"

/* Load the matching y vectors and overwrite v16-v23 with the element-wise
   single-precision products x*y. */
"vl %%v24,0(%%r1,%3) \n\t"
"vfmsb %%v16,%%v16,%%v24 \n\t"
"vl %%v25,16(%%r1,%3) \n\t"
"vfmsb %%v17,%%v17,%%v25 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmsb %%v18,%%v18,%%v26 \n\t"
"vl %%v27,48(%%r1,%3) \n\t"
"vfmsb %%v19,%%v19,%%v27 \n\t"
"vl %%v28,64(%%r1,%3) \n\t"
"vfmsb %%v20,%%v20,%%v28 \n\t"
"vl %%v29,80(%%r1,%3) \n\t"
"vfmsb %%v21,%%v21,%%v29 \n\t"
"vl %%v30,96(%%r1,%3) \n\t"
"vfmsb %%v22,%%v22,%%v30 \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vfmsb %%v23,%%v23,%%v31 \n\t"

/* Widen products to double: v24-v31 receive two of the four products of each
   vector (presumably word elements 0 and 2 - TODO confirm against the
   VFLL definition). */
"vflls %%v24,%%v16 \n\t"
"vflls %%v25,%%v17 \n\t"
"vflls %%v26,%%v18 \n\t"
"vflls %%v27,%%v19 \n\t"
"vflls %%v28,%%v20 \n\t"
"vflls %%v29,%%v21 \n\t"
"vflls %%v30,%%v22 \n\t"
"vflls %%v31,%%v23 \n\t"

/* Shift each doubleword left by 32 bits so the other two products move into
   the positions the next vflls reads. */
"veslg %%v16,%%v16,32 \n\t"
"veslg %%v17,%%v17,32 \n\t"
"veslg %%v18,%%v18,32 \n\t"
"veslg %%v19,%%v19,32 \n\t"
"veslg %%v20,%%v20,32 \n\t"
"veslg %%v21,%%v21,32 \n\t"
"veslg %%v22,%%v22,32 \n\t"
"veslg %%v23,%%v23,32 \n\t"

"vflls %%v16,%%v16 \n\t"
"vflls %%v17,%%v17 \n\t"
"vflls %%v18,%%v18 \n\t"
"vflls %%v19,%%v19 \n\t"
"vflls %%v20,%%v20 \n\t"
"vflls %%v21,%%v21 \n\t"
"vflls %%v22,%%v22 \n\t"
"vflls %%v23,%%v23 \n\t"

/* Double-precision reduction: add the two widened halves of every vector,
   fold the eight partial sums pairwise into v16, then add v16 to v0. */
"vfadb %%v16,%%v16,%%v24 \n\t"
"vfadb %%v17,%%v17,%%v25 \n\t"
"vfadb %%v18,%%v18,%%v26 \n\t"
"vfadb %%v19,%%v19,%%v27 \n\t"
"vfadb %%v20,%%v20,%%v28 \n\t"
"vfadb %%v21,%%v21,%%v29 \n\t"
"vfadb %%v22,%%v22,%%v30 \n\t"
"vfadb %%v23,%%v23,%%v31 \n\t"
"vfadb %%v16,%%v16,%%v20 \n\t"
"vfadb %%v17,%%v17,%%v21 \n\t"
"vfadb %%v18,%%v18,%%v22 \n\t"
"vfadb %%v19,%%v19,%%v23 \n\t"
"vfadb %%v16,%%v16,%%v18 \n\t"
"vfadb %%v17,%%v17,%%v19 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v0,%%v16,%%v0 \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
/* Horizontal add of the two lanes of v0: copy doubleword 1 into v1, add it to
   f0 (assumes f0/f1 overlay doubleword 0 of v0/v1 - TODO confirm), then move
   the scalar result to the output register. */
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
/* v2 and v3 are listed as clobbered although no instruction above names them. */
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return dot;
}

/*
 * Dot product of two FLOAT vectors, accumulated and returned as a double.
 *
 *   n      number of element pairs
 *   x, y   input vectors
 *   inc_x  distance, in elements, between consecutive entries of x
 *   inc_y  distance, in elements, between consecutive entries of y
 *
 * Returns sum(x[i*inc_x] * y[i*inc_y]) for i in [0, n), or 0.0 when n <= 0.
 *
 * In the scalar loops every operand is widened to double before the
 * multiplication, as reference DSDOT does, so each product is formed in
 * double precision instead of being rounded to FLOAT first.
 */
double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
{
    BLASLONG i=0;
    BLASLONG ix=0,iy=0;

    double dot = 0.0 ;

    if ( n <= 0 ) return(dot);

    if ( (inc_x == 1) && (inc_y == 1) )
    {
        /* Contiguous data: the asm kernel takes the leading multiple of 32. */
        BLASLONG n1 = n & -32;

        if ( n1 )
            dot = dsdot_kernel_32(n1,x,y);

        /* Scalar tail (or everything when n < 32). */
        i = n1;
        while(i < n)
        {
            dot += (double) y[i] * (double) x[i] ;
            i++ ;
        }
        return(dot);
    }

    /* Strided data: two element pairs per pass, then at most one leftover. */
    BLASLONG n1 = n & -2;

    while(i < n1)
    {
        dot += (double) y[iy] * (double) x[ix]
             + (double) y[iy+inc_y] * (double) x[ix+inc_x];
        ix += inc_x*2 ;
        iy += inc_y*2 ;
        i+=2 ;
    }

    while(i < n)
    {
        dot += (double) y[iy] * (double) x[ix] ;
        ix += inc_x ;
        iy += inc_y ;
        i++ ;
    }
    return(dot);

}



+ 83
- 209
kernel/zarch/dswap.c View File

@@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -25,217 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/



#include "common.h"



/*
 * dswap_kernel_32 - exchange the contents of x and y, 32 doubles (256 bytes
 * of each vector) per loop iteration. The loop count is n >> 5 and the loop
 * is bottom-tested (brctg), so n must be at least 32.
 *
 * NOTE(review): besides the Z13_SWAP_A / #else split, the #else branch shows
 * the signature twice and two asm statements separated by "return;". This
 * looks like the removed and added sides of a diff rendered without +/-
 * markers - confirm against the real dswap.c before building.
 */
#if defined(Z13_SWAP_A)
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
/* Variant A: swap 16 bytes at a time - load from x and y, store crosswise. */
__asm__ volatile(
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v16, 0(%%r1,%[ptr_x]) \n\t"

"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 16(%%r1,%[ptr_x]) \n\t"

"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 32(%%r1,%[ptr_x]) \n\t"

"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 48(%%r1,%[ptr_x]) \n\t"

"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_x]) \n\t"

"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_x]) \n\t"

"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_x]) \n\t"

"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_x]) \n\t"

"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v16, 128(%%r1,%[ptr_x]) \n\t"

"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 144(%%r1,%[ptr_x]) \n\t"

"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 160(%%r1,%[ptr_x]) \n\t"

"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 176(%%r1,%[ptr_x]) \n\t"

"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_x]) \n\t"

"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 208(%%r1,%[ptr_x]) \n\t"

"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 224(%%r1,%[ptr_x]) \n\t"

"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 240(%%r1,%[ptr_x]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_x] "+m" (*(double (*)[n])x),
[mem_y] "+m" (*(double (*)[n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23"
,"v24","v25","v26","v27","v28","v29","v30","v31"
);
return;

}

#else

static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
/* Named-operand variant: park a whole 256-byte block of x in v16-v31, copy
   the matching block of y over x in two 128-byte halves through v0-v7, then
   write the parked x data to y. */
__asm__ volatile(
"pfd 2, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],5 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"


"vl %%v0, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v1, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v2, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v3, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v4, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v5, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v6, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v7, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v0, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v1, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v2, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v3, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v4, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v5, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v6, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v7, 112(%%r1,%[ptr_x]) \n\t"

"vl %%v0, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v1, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v2, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v3, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v4, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v5, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v6, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v7, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v0, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v1, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v2, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v3, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v4, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v5, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v6, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v7, 240(%%r1,%[ptr_x]) \n\t"

"vst %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_x] "+m" (*(double (*)[n])x),
[mem_y] "+m" (*(double (*)[n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;

/* Positional-operand variant of the same sequence: %0 = n, %1 = x, %2 = y;
   r0 is the loop counter and r1 the running byte offset. It has no output
   operands - the "memory" clobber covers the stores into x and y. */
__asm__ volatile(
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"

"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"

"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"

"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

#endif

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;
@@ -284,5 +160,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,

}



+ 319
- 0
kernel/zarch/icamax.c View File

@@ -0,0 +1,319 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))

/*
 * Vector kernel for the unit-stride path of CNAME: scans complex
 * single-precision data (interleaved re,im floats at x) for the element
 * with the largest |re| + |im|.
 *
 * n     number of complex elements. The loop runs n >> 5 times and consumes
 *       32 elements (256 bytes) per pass; the caller in this file passes
 *       n & -32 and only when that is > 0.
 * x     interleaved input, read only.
 * amax  receives the winning |re| + |im|.
 *
 * Returns the 0-based index of the winning element. The in-loop compares are
 * strict (an equal candidate does not replace the running value) and the
 * final cross-lane reduction keeps the smaller index on equal values.
 *
 * Register roles inside the asm:
 *   v0        running maximum, one float per lane (4 lanes)
 *   v1 / v2   running 64-bit indices for lanes 0,2 / lanes 1,3
 *   v3        constant 16 (index step per 16-element half)
 *   v4        index base of the half currently being processed
 *   v24-v27   constant lane indices 0..15 within a half
 *   r0 / r1   loop counter / byte offset into x
 */
static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
{
    BLASLONG iamax;

    __asm__ volatile (
        /* Seed v0 with |re|+|im| of elements 0..3 and the index vectors. */
        "vlef %%v0,0(%3),0 \n\t"
        "vlef %%v1,4(%3),0 \n\t"
        "vlef %%v0,8(%3),1 \n\t"
        "vlef %%v1,12(%3),1 \n\t"
        "vlef %%v0,16(%3),2 \n\t"
        "vlef %%v1,20(%3),2 \n\t"
        "vlef %%v0,24(%3),3 \n\t"
        "vlef %%v1,28(%3),3 \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vflpsb %%v1,%%v1 \n\t"
        "vfasb %%v0,%%v0,%%v1 \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        "vrepig %%v3,16 \n\t"
        "vzero %%v4 \n\t"
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "srlg %%r0,%2,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        /* Prefetch relative to the current offset (was a fixed address). */
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* First half: de-interleave 16 elements, real parts into the even
           registers (v16,v18,v20,v22), imaginary parts into the odd ones. */
        "vlef %%v16,0(%%r1,%3),0 \n\t"
        "vlef %%v17,4(%%r1,%3),0 \n\t"
        "vlef %%v16,8(%%r1,%3),1 \n\t"
        "vlef %%v17,12(%%r1,%3),1 \n\t"
        "vlef %%v16,16(%%r1,%3),2 \n\t"
        "vlef %%v17,20(%%r1,%3),2 \n\t"
        "vlef %%v16,24(%%r1,%3),3 \n\t"
        "vlef %%v17,28(%%r1,%3),3 \n\t"

        "vlef %%v18,32(%%r1,%3),0 \n\t"
        "vlef %%v19,36(%%r1,%3),0 \n\t"
        "vlef %%v18,40(%%r1,%3),1 \n\t"
        "vlef %%v19,44(%%r1,%3),1 \n\t"
        "vlef %%v18,48(%%r1,%3),2 \n\t"
        "vlef %%v19,52(%%r1,%3),2 \n\t"
        "vlef %%v18,56(%%r1,%3),3 \n\t"
        /* Imaginary part of element 7 lives at byte 60 (was 30). */
        "vlef %%v19,60(%%r1,%3),3 \n\t"

        "vlef %%v20,64(%%r1,%3),0 \n\t"
        "vlef %%v21,68(%%r1,%3),0 \n\t"
        "vlef %%v20,72(%%r1,%3),1 \n\t"
        "vlef %%v21,76(%%r1,%3),1 \n\t"
        "vlef %%v20,80(%%r1,%3),2 \n\t"
        "vlef %%v21,84(%%r1,%3),2 \n\t"
        "vlef %%v20,88(%%r1,%3),3 \n\t"
        "vlef %%v21,92(%%r1,%3),3 \n\t"

        "vlef %%v22,96(%%r1,%3),0 \n\t"
        "vlef %%v23,100(%%r1,%3),0 \n\t"
        "vlef %%v22,104(%%r1,%3),1 \n\t"
        "vlef %%v23,108(%%r1,%3),1 \n\t"
        "vlef %%v22,112(%%r1,%3),2 \n\t"
        "vlef %%v23,116(%%r1,%3),2 \n\t"
        "vlef %%v22,120(%%r1,%3),3 \n\t"
        "vlef %%v23,124(%%r1,%3),3 \n\t"

        /* |re| + |im| for the 16 elements, then a two-level tournament that
           carries the lane-index constants along with the values. */
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"

        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* Widen the 32-bit lane indices to 64 bit (odd lanes -> v6,
           even lanes -> v5) and add the base index of this half. */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /* Merge into the running maximum; the 32-bit compare mask is widened
           the same way before it selects the 64-bit indices. */
        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Second half: same sequence for bytes 128..255. */
        "vlef %%v16,128(%%r1,%3),0 \n\t"
        "vlef %%v17,132(%%r1,%3),0 \n\t"
        "vlef %%v16,136(%%r1,%3),1 \n\t"
        "vlef %%v17,140(%%r1,%3),1 \n\t"
        "vlef %%v16,144(%%r1,%3),2 \n\t"
        "vlef %%v17,148(%%r1,%3),2 \n\t"
        "vlef %%v16,152(%%r1,%3),3 \n\t"
        "vlef %%v17,156(%%r1,%3),3 \n\t"

        "vlef %%v18,160(%%r1,%3),0 \n\t"
        "vlef %%v19,164(%%r1,%3),0 \n\t"
        "vlef %%v18,168(%%r1,%3),1 \n\t"
        "vlef %%v19,172(%%r1,%3),1 \n\t"
        "vlef %%v18,176(%%r1,%3),2 \n\t"
        "vlef %%v19,180(%%r1,%3),2 \n\t"
        "vlef %%v18,184(%%r1,%3),3 \n\t"
        "vlef %%v19,188(%%r1,%3),3 \n\t"

        "vlef %%v20,192(%%r1,%3),0 \n\t"
        "vlef %%v21,196(%%r1,%3),0 \n\t"
        "vlef %%v20,200(%%r1,%3),1 \n\t"
        "vlef %%v21,204(%%r1,%3),1 \n\t"
        "vlef %%v20,208(%%r1,%3),2 \n\t"
        "vlef %%v21,212(%%r1,%3),2 \n\t"
        "vlef %%v20,216(%%r1,%3),3 \n\t"
        "vlef %%v21,220(%%r1,%3),3 \n\t"

        "vlef %%v22,224(%%r1,%3),0 \n\t"
        "vlef %%v23,228(%%r1,%3),0 \n\t"
        "vlef %%v22,232(%%r1,%3),1 \n\t"
        "vlef %%v23,236(%%r1,%3),1 \n\t"
        "vlef %%v22,240(%%r1,%3),2 \n\t"
        "vlef %%v23,244(%%r1,%3),2 \n\t"
        "vlef %%v22,248(%%r1,%3),3 \n\t"
        "vlef %%v23,252(%%r1,%3),3 \n\t"

        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"

        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Reduce lanes 0/1 and 2/3: take the larger value, and on equal
           values the candidate with the smaller index. */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v0,%%v3 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        /* Reduce the two remaining candidates (word 0 vs word 2). */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* NOTE(review): unlike the earlier steps, this 32-bit compare mask
           is not widened before it selects the 64-bit index in v1 -
           confirm the low word of the mask is defined here. */
        "wfchsb %%v4,%%v2,%%v0 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(iamax),"=m"(*amax)
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
    );

    return iamax;
}

/*
 * Index of the complex element with the largest |re| + |im|.
 *
 * n      number of complex elements
 * x      interleaved (re, im) data
 * inc_x  stride between consecutive elements, counted in complex elements
 *
 * Returns the 1-based index of the selected element, or 0 when n <= 0 or
 * inc_x <= 0. The scalar loops use a strict '>' so an earlier element wins
 * over a later equal one.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i = 0;
    BLASLONG ix = 0;
    FLOAT maxf = 0;
    BLASLONG max = 0;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return(max);
    if (inc_x == 1) {

        /* Largest multiple of 32 not exceeding n goes through the kernel. */
        BLASLONG n1 = n & -32;
        if (n1 > 0) {

            max = icamax_kernel_32(n1, x, &maxf);

            i = n1;
            /* Keep the float offset in step with i; without this the tail
               loop re-reads x[0..] instead of the elements after n1. */
            ix = 2 * n1;
        }

        /* Scalar tail (or the whole vector when n < 32). */
        while(i < n)
        {
            if( CABS1(x,ix) > maxf )
            {
                max = i;
                maxf = CABS1(x,ix);
            }
            ix += 2;
            i++;
        }
        return (max + 1);

    } else {
        /* Strided input: each complex element is 2 * inc_x floats apart. */
        inc_x2 = 2 * inc_x;

        maxf = CABS1(x,0);
        ix += inc_x2;
        i++;

        while(i < n)
        {
            if( CABS1(x,ix) > maxf )
            {
                max = i;
                maxf = CABS1(x,ix);
            }
            ix += inc_x2;
            i++;
        }
        return (max + 1);
    }
}



+ 319
- 0
kernel/zarch/icamin.c View File

@@ -0,0 +1,319 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))

/*
 * Vector kernel for the unit-stride path of CNAME: scans complex
 * single-precision data (interleaved re,im floats at x) for the element
 * with the smallest |re| + |im|.
 *
 * n     number of complex elements. The loop runs n >> 5 times and consumes
 *       32 elements (256 bytes) per pass; the caller in this file passes
 *       n & -32 and only when that is > 0.
 * x     interleaved input, read only.
 * amin  receives the winning |re| + |im|.
 *
 * Returns the 0-based index of the winning element. The in-loop compares are
 * strict (an equal candidate does not replace the running value) and the
 * final cross-lane reduction keeps the smaller index on equal values.
 *
 * Register roles inside the asm:
 *   v0        running minimum, one float per lane (4 lanes)
 *   v1 / v2   running 64-bit indices for lanes 0,2 / lanes 1,3
 *   v3        constant 16 (index step per 16-element half)
 *   v4        index base of the half currently being processed
 *   v24-v27   constant lane indices 0..15 within a half
 *   r0 / r1   loop counter / byte offset into x
 */
static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
{
    BLASLONG iamin;

    __asm__ volatile (
        /* Seed v0 with |re|+|im| of elements 0..3 and the index vectors. */
        "vlef %%v0,0(%3),0 \n\t"
        "vlef %%v1,4(%3),0 \n\t"
        "vlef %%v0,8(%3),1 \n\t"
        "vlef %%v1,12(%3),1 \n\t"
        "vlef %%v0,16(%3),2 \n\t"
        "vlef %%v1,20(%3),2 \n\t"
        "vlef %%v0,24(%3),3 \n\t"
        "vlef %%v1,28(%3),3 \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vflpsb %%v1,%%v1 \n\t"
        "vfasb %%v0,%%v0,%%v1 \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        "vrepig %%v3,16 \n\t"
        "vzero %%v4 \n\t"
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "srlg %%r0,%2,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* First half: de-interleave 16 elements, real parts into the even
           registers (v16,v18,v20,v22), imaginary parts into the odd ones. */
        "vlef %%v16,0(%%r1,%3),0 \n\t"
        "vlef %%v17,4(%%r1,%3),0 \n\t"
        "vlef %%v16,8(%%r1,%3),1 \n\t"
        "vlef %%v17,12(%%r1,%3),1 \n\t"
        "vlef %%v16,16(%%r1,%3),2 \n\t"
        "vlef %%v17,20(%%r1,%3),2 \n\t"
        "vlef %%v16,24(%%r1,%3),3 \n\t"
        "vlef %%v17,28(%%r1,%3),3 \n\t"

        "vlef %%v18,32(%%r1,%3),0 \n\t"
        "vlef %%v19,36(%%r1,%3),0 \n\t"
        "vlef %%v18,40(%%r1,%3),1 \n\t"
        "vlef %%v19,44(%%r1,%3),1 \n\t"
        "vlef %%v18,48(%%r1,%3),2 \n\t"
        "vlef %%v19,52(%%r1,%3),2 \n\t"
        "vlef %%v18,56(%%r1,%3),3 \n\t"
        /* Imaginary part of element 7 lives at byte 60 (was 30). */
        "vlef %%v19,60(%%r1,%3),3 \n\t"

        "vlef %%v20,64(%%r1,%3),0 \n\t"
        "vlef %%v21,68(%%r1,%3),0 \n\t"
        "vlef %%v20,72(%%r1,%3),1 \n\t"
        "vlef %%v21,76(%%r1,%3),1 \n\t"
        "vlef %%v20,80(%%r1,%3),2 \n\t"
        "vlef %%v21,84(%%r1,%3),2 \n\t"
        "vlef %%v20,88(%%r1,%3),3 \n\t"
        "vlef %%v21,92(%%r1,%3),3 \n\t"

        "vlef %%v22,96(%%r1,%3),0 \n\t"
        "vlef %%v23,100(%%r1,%3),0 \n\t"
        "vlef %%v22,104(%%r1,%3),1 \n\t"
        "vlef %%v23,108(%%r1,%3),1 \n\t"
        "vlef %%v22,112(%%r1,%3),2 \n\t"
        "vlef %%v23,116(%%r1,%3),2 \n\t"
        "vlef %%v22,120(%%r1,%3),3 \n\t"
        "vlef %%v23,124(%%r1,%3),3 \n\t"

        /* |re| + |im| for the 16 elements, then a two-level tournament that
           carries the lane-index constants along with the values. The
           compare operands are swapped relative to a max search so that
           vsel keeps the smaller value. */
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"

        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* Widen the 32-bit lane indices to 64 bit (odd lanes -> v6,
           even lanes -> v5) and add the base index of this half. */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /* Merge into the running minimum; the 32-bit compare mask is widened
           the same way before it selects the 64-bit indices. */
        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Second half: same sequence for bytes 128..255. */
        "vlef %%v16,128(%%r1,%3),0 \n\t"
        "vlef %%v17,132(%%r1,%3),0 \n\t"
        "vlef %%v16,136(%%r1,%3),1 \n\t"
        "vlef %%v17,140(%%r1,%3),1 \n\t"
        "vlef %%v16,144(%%r1,%3),2 \n\t"
        "vlef %%v17,148(%%r1,%3),2 \n\t"
        "vlef %%v16,152(%%r1,%3),3 \n\t"
        "vlef %%v17,156(%%r1,%3),3 \n\t"

        "vlef %%v18,160(%%r1,%3),0 \n\t"
        "vlef %%v19,164(%%r1,%3),0 \n\t"
        "vlef %%v18,168(%%r1,%3),1 \n\t"
        "vlef %%v19,172(%%r1,%3),1 \n\t"
        "vlef %%v18,176(%%r1,%3),2 \n\t"
        "vlef %%v19,180(%%r1,%3),2 \n\t"
        "vlef %%v18,184(%%r1,%3),3 \n\t"
        "vlef %%v19,188(%%r1,%3),3 \n\t"

        "vlef %%v20,192(%%r1,%3),0 \n\t"
        "vlef %%v21,196(%%r1,%3),0 \n\t"
        "vlef %%v20,200(%%r1,%3),1 \n\t"
        "vlef %%v21,204(%%r1,%3),1 \n\t"
        "vlef %%v20,208(%%r1,%3),2 \n\t"
        "vlef %%v21,212(%%r1,%3),2 \n\t"
        "vlef %%v20,216(%%r1,%3),3 \n\t"
        "vlef %%v21,220(%%r1,%3),3 \n\t"

        "vlef %%v22,224(%%r1,%3),0 \n\t"
        "vlef %%v23,228(%%r1,%3),0 \n\t"
        "vlef %%v22,232(%%r1,%3),1 \n\t"
        "vlef %%v23,236(%%r1,%3),1 \n\t"
        "vlef %%v22,240(%%r1,%3),2 \n\t"
        "vlef %%v23,244(%%r1,%3),2 \n\t"
        "vlef %%v22,248(%%r1,%3),3 \n\t"
        "vlef %%v23,252(%%r1,%3),3 \n\t"

        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfasb %%v16,%%v16,%%v17 \n\t"
        "vfasb %%v17,%%v18,%%v19 \n\t"
        "vfasb %%v18,%%v20,%%v21 \n\t"
        "vfasb %%v19,%%v22,%%v23 \n\t"
        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"

        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Reduce lanes 0/1 and 2/3: take the smaller value, and on equal
           values the candidate with the smaller index. */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v3,%%v0 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        /* Reduce the two remaining candidates (word 0 vs word 2). */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* NOTE(review): unlike the earlier steps, this 32-bit compare mask
           is not widened before it selects the 64-bit index in v1 -
           confirm the low word of the mask is defined here. */
        "wfchsb %%v4,%%v0,%%v2 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(iamin),"=m"(*amin)
        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
    );

    return iamin;
}

/*
 * Index of the complex element with the smallest |re| + |im|.
 *
 * n      number of complex elements
 * x      interleaved (re, im) data
 * inc_x  stride between consecutive elements, counted in complex elements
 *
 * Returns the 1-based index of the selected element, or 0 when n <= 0 or
 * inc_x <= 0. The scalar loops use a strict '<' so an earlier element wins
 * over a later equal one.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i = 0;
    BLASLONG ix = 0;
    FLOAT minf = 0;
    BLASLONG min = 0;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return(min);
    if (inc_x == 1) {

        /* Largest multiple of 32 not exceeding n goes through the kernel. */
        BLASLONG n1 = n & -32;
        if (n1 > 0) {

            min = icamin_kernel_32(n1, x, &minf);

            i = n1;
            /* Keep the float offset in step with i; without this the tail
               loop re-reads x[0..] instead of the elements after n1. */
            ix = 2 * n1;
        } else {
            /* No vector pass: seed the running minimum from element 0.
               Starting from 0 would never be beaten by a sum of absolute
               values, so the result would always be index 1. */
            minf = CABS1(x,0);
            ix += 2;
            i++;
        }

        /* Scalar tail (or the rest of the vector when n < 32). */
        while(i < n)
        {
            if( CABS1(x,ix) < minf )
            {
                min = i;
                minf = CABS1(x,ix);
            }
            ix += 2;
            i++;
        }
        return (min + 1);

    } else {
        /* Strided input: each complex element is 2 * inc_x floats apart. */
        inc_x2 = 2 * inc_x;

        minf = CABS1(x,0);
        ix += inc_x2;
        i++;

        while(i < n)
        {
            if( CABS1(x,ix) < minf )
            {
                min = i;
                minf = CABS1(x,ix);
            }
            ix += inc_x2;
            i++;
        }
        return (min + 1);
    }
}



+ 152
- 143
kernel/zarch/idamax.c View File

@@ -23,164 +23,173 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
*****************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif

/**
* Find maximum index
* Warning: requirements n>0 and n % 32 == 0
* @param n
* @param x pointer to the vector
* @param maxf (out) maximum absolute value .( only for output )
* @return index
*/
static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
__asm__(
"pfd 1, 0(%[ptr_x]) \n\t"
"sllg %%r0,%[n],3 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vleig %%v20,0,0 \n\t"
"vleig %%v20,1,1 \n\t"
"vleig %%v21,2,0 \n\t"
"vleig %%v21,3,1 \n\t"
"vleig %%v22,4,0 \n\t"
"vleig %%v22,5,1 \n\t"
"vleig %%v23,6,0 \n\t"
"vleig %%v23,7,1 \n\t"
"vrepig %%v4,8 \n\t"
"vzero %%v5 \n\t"
"vzero %%v18 \n\t"
"vzero %%v19 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
"vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfchdb %%v16,%%v25,%%v24 \n\t "
"vfchdb %%v17,%%v27,%%v26 \n\t "
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
"vfchdb %%v16,%%v29,%%v28 \n\t "
"vfchdb %%v17,%%v31,%%v30 \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"

"vfchdb %%v28, %%v3,%%v0 \n\t"
"vfchdb %%v29,%%v27, %%v25 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
"vag %%v1,%%v1,%%v5 \n\t"
"vag %%v24,%%v24,%%v5 \n\t"
"vag %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v16,%%v25 , %%v0 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
"vfchdb %%v17, %%v29,%%v18 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfchdb %%v16,%%v25,%%v24 \n\t "
"vfchdb %%v17,%%v27,%%v26 \n\t "
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
"vfchdb %%v16,%%v29,%%v28 \n\t "
"vfchdb %%v17,%%v31,%%v30 \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"

"vfchdb %%v28, %%v3,%%v0 \n\t"
"vfchdb %%v29,%%v27, %%v25 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"
"vag %%v1,%%v1,%%v5 \n\t"
"vag %%v24,%%v24,%%v5 \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vag %%v24,%%v24,%%v4 \n\t"
"vfchdb %%v16,%%v25 , %%v0 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"
"vfchdb %%v17, %%v29,%%v18 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"

"vrepg %%v26,%%v18,1 \n\t"
"vrepg %%v5,%%v19,1 \n\t"
"wfcdb %%v26,%%v18 \n\t"
"jne 2f \n\t"
"vsteg %%v18,%[maxf],0 \n\t"
"vmnlg %%v1,%%v5,%%v19 \n\t"
"j 3f \n\t"

"2: \n\t"
"wfchdb %%v16,%%v26,%%v18 \n\t"
"vsel %%v1,%%v5,%%v19,%%v16 \n\t"
"vsel %%v0,%%v26,%%v18,%%v16 \n\t"
"std %%f0,%[maxf] \n\t"
"3: \n\t"
"vlgvg %[index],%%v1,0 \n\t"
: [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x)
: "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return index;
static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
{
BLASLONG iamax;

}
__asm__ volatile (
"vl %%v0,0(%3) \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,16 \n\t"
"vzero %%v3 \n\t"
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"vleig %%v28,8,0 \n\t"
"vleig %%v28,9,1 \n\t"
"vleig %%v29,10,0 \n\t"
"vleig %%v29,11,1 \n\t"
"vleig %%v30,12,0 \n\t"
"vleig %%v30,13,1 \n\t"
"vleig %%v31,14,0 \n\t"
"vleig %%v31,15,1 \n\t"
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"

"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfchdb %%v4,%%v16,%%v17 \n\t"
"vfchdb %%v5,%%v18,%%v19 \n\t"
"vfchdb %%v6,%%v20,%%v21 \n\t"
"vfchdb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"

"vfchdb %%v20,%%v16,%%v17 \n\t"
"vfchdb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"

"vfchdb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchdb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"

"vfchdb %%v4,%%v16,%%v17 \n\t"
"vfchdb %%v5,%%v18,%%v19 \n\t"
"vfchdb %%v6,%%v20,%%v21 \n\t"
"vfchdb %%v7,%%v22,%%v23 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"

"vfchdb %%v20,%%v16,%%v17 \n\t"
"vfchdb %%v21,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"

"vfchdb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchdb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v2,%%v0 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%1 \n\t"
"2: \n\t"
"nop "
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return iamax;
}
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
BLASLONG ix = 0;
FLOAT maxf = 0.0;
BLASLONG max = 0;

@@ -191,7 +200,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG n1 = n & -32;
if (n1 > 0) {

max = diamax_kernel_32_TUNED(n1, x, &maxf);
max = idamax_kernel_32(n1, x, &maxf);

i = n1;
}


+ 159
- 166
kernel/zarch/idamin.c View File

@@ -23,192 +23,185 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
*****************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif

/**
* Find minimum index
* Warning: requirements n>0 and n % 32 == 0
* @param n
* @param x pointer to the vector
* @param minf (out) minimum absolute value .( only for output )
* @return minimum index
*/
static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index;
__asm__(
"pfd 1, 0(%[ptr_x]) \n\t"
"sllg %%r0,%[n],3 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vleig %%v20,0,0 \n\t"
"vleig %%v20,1,1 \n\t"
"vleig %%v21,2,0 \n\t"
"vleig %%v21,3,1 \n\t"
"vleig %%v22,4,0 \n\t"
"vleig %%v22,5,1 \n\t"
"vleig %%v23,6,0 \n\t"
"vleig %%v23,7,1 \n\t"
"vrepig %%v4,8 \n\t"
"vlrepg %%v18,0(%[ptr_x]) \n\t"
"vzero %%v5 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vzero %%v19 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
"vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t"

"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"

"vfchdb %%v16,%%v24,%%v25 \n\t "
"vfchdb %%v17,%%v26 ,%%v27 \n\t "
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
"vfchdb %%v16,%%v28, %%v29 \n\t "
"vfchdb %%v17,%%v30,%%v31 \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"


"vfchdb %%v28,%%v0 , %%v3 \n\t"
"vfchdb %%v29, %%v25,%%v27 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"

"vag %%v1,%%v1,%%v5 \n\t"
"vag %%v24,%%v24,%%v5 \n\t"
"vag %%v24,%%v24,%%v4 \n\t"

"vfchdb %%v16, %%v0,%%v25 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"

"vfchdb %%v17,%%v18, %%v29 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"

"vag %%v5,%%v5,%%v4 \n\t"

"vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"

"vfchdb %%v16,%%v24,%%v25 \n\t"
"vfchdb %%v17,%%v26 ,%%v27 \n\t"
"vsel %%v1,%%v21,%%v20,%%v16 \n\t"
"vsel %%v0,%%v25,%%v24,%%v16 \n\t"
"vsel %%v2,%%v23,%%v22,%%v17 \n\t"
"vsel %%v3,%%v27,%%v26,%%v17 \n\t"
"vfchdb %%v16,%%v28 ,%%v29 \n\t"
"vfchdb %%v17,%%v30,%%v31 \n\t"
"vsel %%v24,%%v21,%%v20,%%v16 \n\t"
"vsel %%v25,%%v29,%%v28,%%v16 \n\t"
"vsel %%v26,%%v23,%%v22,%%v17 \n\t"
"vsel %%v27,%%v31,%%v30,%%v17 \n\t"


"vfchdb %%v28,%%v0 , %%v3 \n\t"
"vfchdb %%v29, %%v25,%%v27 \n\t"
"vsel %%v1,%%v2,%%v1,%%v28 \n\t"
"vsel %%v0,%%v3,%%v0,%%v28 \n\t"
"vsel %%v24,%%v26,%%v24,%%v29 \n\t"
"vsel %%v25,%%v27,%%v25,%%v29 \n\t"

"vag %%v1,%%v1,%%v5 \n\t"
"vag %%v24,%%v24,%%v5 \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vag %%v24,%%v24,%%v4 \n\t"

"vfchdb %%v16, %%v0,%%v25 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"vsel %%v29,%%v25,%%v0,%%v16 \n\t"
"vsel %%v28,%%v24,%%v1,%%v16 \n\t"

"vfchdb %%v17,%%v18, %%v29 \n\t"
"vsel %%v19,%%v28,%%v19,%%v17 \n\t"
"vsel %%v18,%%v29,%%v18,%%v17 \n\t"

"vag %%v5,%%v5,%%v4 \n\t"

"clgrjl %[ptr_tmp],%%r0,1b \n\t"


"vrepg %%v26,%%v18,1 \n\t"
"vrepg %%v5,%%v19,1 \n\t"
"wfcdb %%v26,%%v18 \n\t"
"jne 2f \n\t"
"vsteg %%v18,%[minf],0 \n\t"
"vmnlg %%v1,%%v5,%%v19 \n\t"
"j 3f \n\t"
"2: \n\t"
"wfchdb %%v16,%%v18 ,%%v26 \n\t "
"vsel %%v1,%%v5,%%v19,%%v16 \n\t"
"vsel %%v0,%%v26,%%v18,%%v16 \n\t"
"std %%f0,%[minf] \n\t"

"3: \n\t"
"vlgvg %[index],%%v1,0 \n\t"

: [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x)
: "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"

);
return index;

/*
 * Vector kernel that scans contiguous values at x for the smallest absolute
 * value (the asm uses the double-precision vector forms: vflpdb / vfchdb).
 *
 * n     element count; the loop runs n >> 5 times and consumes 32 elements
 *       (256 bytes) per pass, in two halves of 16.
 * x     input data, read only.
 * amin  receives the winning absolute value.
 *
 * Returns the winning index, counted from 0 (the lane-index constants start
 * at 0). In-loop compares are strict, so an equal candidate does not replace
 * the running value; when the two final lanes hold equal values the smaller
 * index is returned (vmnlg).
 *
 * Register roles inside the asm:
 *   v0        running minimum, one value per lane (2 lanes)
 *   v1        running 64-bit indices for the two lanes
 *   v2        constant 16 (index step per half)
 *   v3        index base of the half currently being processed
 *   v24-v31   constant lane indices 0..15 within a half
 *   r0 / r1   loop counter / byte offset into x
 */
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
{
BLASLONG iamin;

__asm__ volatile (
/* Seed the running minimum with |x[0]|,|x[1]| and build the constants. */
"vl %%v0,0(%3) \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,16 \n\t"
"vzero %%v3 \n\t"
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"vleig %%v28,8,0 \n\t"
"vleig %%v28,9,1 \n\t"
"vleig %%v29,10,0 \n\t"
"vleig %%v29,11,1 \n\t"
"vleig %%v30,12,0 \n\t"
"vleig %%v30,13,1 \n\t"
"vleig %%v31,14,0 \n\t"
"vleig %%v31,15,1 \n\t"
"srlg %%r0,%2,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"

/* First half: load 16 values and take absolute values. */
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
/* Tournament round 1: the mask is "second > first", so vsel keeps the
   smaller value and the matching lane-index constant. */
"vfchdb %%v4,%%v17,%%v16 \n\t"
"vfchdb %%v5,%%v19,%%v18 \n\t"
"vfchdb %%v6,%%v21,%%v20 \n\t"
"vfchdb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"

/* Round 2. */
"vfchdb %%v20,%%v17,%%v16 \n\t"
"vfchdb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"

/* Round 3, then turn the lane-local index into a global one. */
"vfchdb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

/* Merge into the running minimum and step the index base by 16. */
"vfchdb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

/* Second half: same sequence for bytes 128..255. */
"vl %%v16,128(%%r1,%3) \n\t"
"vl %%v17,144(%%r1,%3) \n\t"
"vl %%v18,160(%%r1,%3) \n\t"
"vl %%v19,176(%%r1,%3) \n\t"
"vl %%v20,192(%%r1,%3) \n\t"
"vl %%v21,208(%%r1,%3) \n\t"
"vl %%v22,224(%%r1,%3) \n\t"
"vl %%v23,240(%%r1,%3) \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"

"vfchdb %%v4,%%v17,%%v16 \n\t"
"vfchdb %%v5,%%v19,%%v18 \n\t"
"vfchdb %%v6,%%v21,%%v20 \n\t"
"vfchdb %%v7,%%v23,%%v22 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"
"vsel %%v18,%%v20,%%v21,%%v6 \n\t"
"vsel %%v6,%%v28,%%v29,%%v6 \n\t"
"vsel %%v19,%%v22,%%v23,%%v7 \n\t"
"vsel %%v7,%%v30,%%v31,%%v7 \n\t"

"vfchdb %%v20,%%v17,%%v16 \n\t"
"vfchdb %%v21,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v20 \n\t"
"vsel %%v4,%%v4,%%v5,%%v20 \n\t"
"vsel %%v17,%%v18,%%v19,%%v21 \n\t"
"vsel %%v5,%%v6,%%v7,%%v21 \n\t"

"vfchdb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchdb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

/* Advance 256 bytes and loop while the counter is non-zero. */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

/* Final reduction across the two lanes: equal values -> store the value and
   take the smaller index; otherwise pick lane 1 when lane 0 is larger. */
"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v0,%%v2 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%1 \n\t"
"2: \n\t"
"nop "
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return iamin;
}



BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
BLASLONG i = 0;
BLASLONG j = 0;
BLASLONG ix = 0;
BLASLONG min = 0;
FLOAT minf = 0.0;
BLASLONG min = 0;

if (n <= 0 || inc_x <= 0) return (min);
minf = ABS(x[0]); //index's not incremented,though it will make first comparision redundant
if (inc_x == 1) {

BLASLONG n1 = n & -32;
if (n1 > 0) {

min = diamin_kernel_32(n1, x, &minf);
min = idamin_kernel_32(n1, x, &minf);

i = n1;
}



+ 232
- 0
kernel/zarch/idmax.c View File

@@ -0,0 +1,232 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * idmax_kernel_32: vector search for the largest element of x (signed
 * comparison, no absolute value is taken).
 *
 * n    number of elements to scan.  The loop count is n >> 5 and every pass
 *      consumes 32 doubles (256 bytes), so n must be a positive multiple of
 *      32; the loop is closed by a branch-on-count, so a count of 0 is not a
 *      valid input.
 * x    input vector, read with unit stride.
 * max  receives the largest value found.
 *
 * Returns the 0-based index of the value stored in *max.
 *
 * Register roles inside the asm:
 *   v0        running maxima, one per 64-bit lane (seeded with x[0], x[1])
 *   v1        indices belonging to the values in v0 (seeded with 0, 1)
 *   v2        constant 16 in both lanes: index step per 16-element half
 *   v3        index base of the half-block currently being processed
 *   v24-v31   lane indices 0..15 of the eight vectors of one half-block
 *   r0 / r1   remaining loop count / byte offset into x
 *
 * NOTE(review): the tree reduction uses a strict "greater than" compare and
 * takes the second operand when the compare is false, so an exact tie inside
 * a 16-element half selects the later element; only the final cross-lane
 * step (vmnlg) prefers the smaller index.
 */
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max)
{
    BLASLONG imax;

    __asm__ volatile (
        /* Seed the running maxima/indices and build the index constants. */
        "vl %%v0,0(%3) \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,1,1 \n\t"
        "vrepig %%v2,16 \n\t"
        "vzero %%v3 \n\t"
        "vleig %%v24,0,0 \n\t"
        "vleig %%v24,1,1 \n\t"
        "vleig %%v25,2,0 \n\t"
        "vleig %%v25,3,1 \n\t"
        "vleig %%v26,4,0 \n\t"
        "vleig %%v26,5,1 \n\t"
        "vleig %%v27,6,0 \n\t"
        "vleig %%v27,7,1 \n\t"
        "vleig %%v28,8,0 \n\t"
        "vleig %%v28,9,1 \n\t"
        "vleig %%v29,10,0 \n\t"
        "vleig %%v29,11,1 \n\t"
        "vleig %%v30,12,0 \n\t"
        "vleig %%v30,13,1 \n\t"
        "vleig %%v31,14,0 \n\t"
        "vleig %%v31,15,1 \n\t"
        /* r0 = n / 32 passes, r1 = byte offset 0. */
        "srlg %%r0,%2,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* First half: elements 0..15 of this pass. */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        /* 8 -> 4 vectors: keep the larger value and its lane index. */
        "vfchdb %%v4,%%v16,%%v17 \n\t"
        "vfchdb %%v5,%%v18,%%v19 \n\t"
        "vfchdb %%v6,%%v20,%%v21 \n\t"
        "vfchdb %%v7,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v4 \n\t"
        "vsel %%v4,%%v24,%%v25,%%v4 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v5 \n\t"
        "vsel %%v5,%%v26,%%v27,%%v5 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v6 \n\t"
        "vsel %%v6,%%v28,%%v29,%%v6 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v7 \n\t"
        "vsel %%v7,%%v30,%%v31,%%v7 \n\t"

        /* 4 -> 2 vectors. */
        "vfchdb %%v20,%%v16,%%v17 \n\t"
        "vfchdb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v5,%%v6,%%v7,%%v21 \n\t"

        /* 2 -> 1 vector, then turn the lane index into a global index. */
        "vfchdb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v18 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Merge into the running maxima; advance the index base by 16. */
        "vfchdb %%v5,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v5 \n\t"
        "vsel %%v1,%%v4,%%v1,%%v5 \n\t"
        "vag %%v3,%%v3,%%v2 \n\t"

        /* Second half: elements 16..31 of this pass, same scheme. */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"

        "vfchdb %%v4,%%v16,%%v17 \n\t"
        "vfchdb %%v5,%%v18,%%v19 \n\t"
        "vfchdb %%v6,%%v20,%%v21 \n\t"
        "vfchdb %%v7,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v4 \n\t"
        "vsel %%v4,%%v24,%%v25,%%v4 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v5 \n\t"
        "vsel %%v5,%%v26,%%v27,%%v5 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v6 \n\t"
        "vsel %%v6,%%v28,%%v29,%%v6 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v7 \n\t"
        "vsel %%v7,%%v30,%%v31,%%v7 \n\t"

        "vfchdb %%v20,%%v16,%%v17 \n\t"
        "vfchdb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v5,%%v6,%%v7,%%v21 \n\t"

        "vfchdb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v18 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        "vfchdb %%v5,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v5 \n\t"
        "vsel %%v1,%%v4,%%v1,%%v5 \n\t"
        "vag %%v3,%%v3,%%v2 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Combine the two lanes of v0/v1 into the final value and index. */
        "vrepg %%v2,%%v0,1 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcdb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* Lanes equal: store the value, take the smaller of the two indices. */
        "vsteg %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* Lanes differ: pick the larger value together with its index. */
        "wfchdb %%v4,%%v2,%%v0 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "std %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        /*
         * %0 is earlyclobber: on the "1:" path it is written by vlgvg before
         * the address of %1 is used by std, so it must not share a register
         * with that address (or with any other input).
         */
        :"=&r"(imax),"=m"(*max)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return imax;
}
/*
 * idmax driver: returns the 1-based position of the largest element
 * (signed comparison) among the n elements of x taken with stride inc_x.
 * Returns 0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the leading multiple-of-32 part is handled by
 * idmax_kernel_32 and the remainder by a scalar loop; any other stride is
 * handled entirely by the scalar loops.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG j = 0;
    FLOAT maxf = 0.0;
    BLASLONG max = 0;

    if (n <= 0 || inc_x <= 0) return (max);

    /*
     * Seed the running maximum with the first element rather than 0.0:
     * with a 0.0 seed a vector whose elements are all negative would never
     * beat the seed and position 1 would be reported regardless of the data.
     * The scalar loops still start at element 0, which only makes the first
     * comparison redundant.  The kernel overwrites maxf when it runs.
     */
    maxf = x[0];

    if (inc_x == 1) {

        BLASLONG n1 = n & -32;
        if (n1 > 0) {

            max = idmax_kernel_32(n1, x, &maxf);

            i = n1;
        }

        /* Scalar tail (or the whole vector when n < 32). */
        while (i < n) {
            if (x[i] > maxf) {
                max = i;
                maxf = x[i];
            }
            i++;
        }
        return (max + 1);

    } else {

        /* Strided access: i is the offset into x, j the logical position. */
        BLASLONG n1 = n & -4;
        while (j < n1) {

            if (x[i] > maxf) {
                max = j;
                maxf = x[i];
            }
            if (x[i + inc_x] > maxf) {
                max = j + 1;
                maxf = x[i + inc_x];
            }
            if (x[i + 2 * inc_x] > maxf) {
                max = j + 2;
                maxf = x[i + 2 * inc_x];
            }
            if (x[i + 3 * inc_x] > maxf) {
                max = j + 3;
                maxf = x[i + 3 * inc_x];
            }

            i += inc_x * 4;

            j += 4;

        }

        while (j < n) {
            if (x[i] > maxf) {
                max = j;
                maxf = x[i];
            }
            i += inc_x;
            j++;
        }
        return (max + 1);
    }
}

+ 232
- 0
kernel/zarch/idmin.c View File

@@ -0,0 +1,232 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * idmin_kernel_32: vector search for the smallest element of x (signed
 * comparison, no absolute value is taken).
 *
 * n    number of elements to scan.  The loop count is n >> 5 and every pass
 *      consumes 32 doubles (256 bytes), so n must be a positive multiple of
 *      32; the loop is closed by a branch-on-count, so a count of 0 is not a
 *      valid input.
 * x    input vector, read with unit stride.
 * min  receives the smallest value found.
 *
 * Returns the 0-based index of the value stored in *min.
 *
 * Register roles inside the asm:
 *   v0        running minima, one per 64-bit lane (seeded with x[0], x[1])
 *   v1        indices belonging to the values in v0 (seeded with 0, 1)
 *   v2        constant 16 in both lanes: index step per 16-element half
 *   v3        index base of the half-block currently being processed
 *   v24-v31   lane indices 0..15 of the eight vectors of one half-block
 *   r0 / r1   remaining loop count / byte offset into x
 *
 * The compares are the "greater than" compares of the max kernel with the
 * operands swapped, so each select keeps the smaller value.
 *
 * NOTE(review): the compares are strict and the second operand is taken when
 * the compare is false, so an exact tie inside a 16-element half selects the
 * later element; only the final cross-lane step (vmnlg) prefers the smaller
 * index.
 */
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min)
{
    BLASLONG imin;

    __asm__ volatile (
        /* Seed the running minima/indices and build the index constants. */
        "vl %%v0,0(%3) \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,1,1 \n\t"
        "vrepig %%v2,16 \n\t"
        "vzero %%v3 \n\t"
        "vleig %%v24,0,0 \n\t"
        "vleig %%v24,1,1 \n\t"
        "vleig %%v25,2,0 \n\t"
        "vleig %%v25,3,1 \n\t"
        "vleig %%v26,4,0 \n\t"
        "vleig %%v26,5,1 \n\t"
        "vleig %%v27,6,0 \n\t"
        "vleig %%v27,7,1 \n\t"
        "vleig %%v28,8,0 \n\t"
        "vleig %%v28,9,1 \n\t"
        "vleig %%v29,10,0 \n\t"
        "vleig %%v29,11,1 \n\t"
        "vleig %%v30,12,0 \n\t"
        "vleig %%v30,13,1 \n\t"
        "vleig %%v31,14,0 \n\t"
        "vleig %%v31,15,1 \n\t"
        /* r0 = n / 32 passes, r1 = byte offset 0. */
        "srlg %%r0,%2,5 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* First half: elements 0..15 of this pass. */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        /* 8 -> 4 vectors: keep the smaller value and its lane index. */
        "vfchdb %%v4,%%v17,%%v16 \n\t"
        "vfchdb %%v5,%%v19,%%v18 \n\t"
        "vfchdb %%v6,%%v21,%%v20 \n\t"
        "vfchdb %%v7,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v4 \n\t"
        "vsel %%v4,%%v24,%%v25,%%v4 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v5 \n\t"
        "vsel %%v5,%%v26,%%v27,%%v5 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v6 \n\t"
        "vsel %%v6,%%v28,%%v29,%%v6 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v7 \n\t"
        "vsel %%v7,%%v30,%%v31,%%v7 \n\t"

        /* 4 -> 2 vectors. */
        "vfchdb %%v20,%%v17,%%v16 \n\t"
        "vfchdb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v5,%%v6,%%v7,%%v21 \n\t"

        /* 2 -> 1 vector, then turn the lane index into a global index. */
        "vfchdb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v18 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Merge into the running minima; advance the index base by 16. */
        "vfchdb %%v5,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v5 \n\t"
        "vsel %%v1,%%v4,%%v1,%%v5 \n\t"
        "vag %%v3,%%v3,%%v2 \n\t"

        /* Second half: elements 16..31 of this pass, same scheme. */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"

        "vfchdb %%v4,%%v17,%%v16 \n\t"
        "vfchdb %%v5,%%v19,%%v18 \n\t"
        "vfchdb %%v6,%%v21,%%v20 \n\t"
        "vfchdb %%v7,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v4 \n\t"
        "vsel %%v4,%%v24,%%v25,%%v4 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v5 \n\t"
        "vsel %%v5,%%v26,%%v27,%%v5 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v6 \n\t"
        "vsel %%v6,%%v28,%%v29,%%v6 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v7 \n\t"
        "vsel %%v7,%%v30,%%v31,%%v7 \n\t"

        "vfchdb %%v20,%%v17,%%v16 \n\t"
        "vfchdb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v5,%%v6,%%v7,%%v21 \n\t"

        "vfchdb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v4,%%v4,%%v5,%%v18 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        "vfchdb %%v5,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v5 \n\t"
        "vsel %%v1,%%v4,%%v1,%%v5 \n\t"
        "vag %%v3,%%v3,%%v2 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* Combine the two lanes of v0/v1 into the final value and index. */
        "vrepg %%v2,%%v0,1 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcdb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* Lanes equal: store the value, take the smaller of the two indices. */
        "vsteg %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* Lanes differ: pick the smaller value together with its index. */
        "wfchdb %%v4,%%v0,%%v2 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "std %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        /*
         * %0 is earlyclobber: on the "1:" path it is written by vlgvg before
         * the address of %1 is used by std, so it must not share a register
         * with that address (or with any other input).
         */
        :"=&r"(imin),"=m"(*min)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return imin;
}
/*
 * idmin driver: returns the 1-based position of the smallest element
 * (signed comparison) among the n elements of x taken with stride inc_x.
 * Returns 0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the leading multiple-of-32 part is handled by
 * idmin_kernel_32 and the remainder by a scalar loop; any other stride is
 * handled entirely by the scalar loops.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG j = 0;
    FLOAT minf = 0.0;
    BLASLONG min = 0;

    if (n <= 0 || inc_x <= 0) return (min);

    /*
     * Seed the running minimum with the first element rather than 0.0:
     * with a 0.0 seed a vector whose elements are all positive would never
     * go below the seed and position 1 would be reported regardless of the
     * data.  The scalar loops still start at element 0, which only makes the
     * first comparison redundant.  The kernel overwrites minf when it runs.
     */
    minf = x[0];

    if (inc_x == 1) {

        BLASLONG n1 = n & -32;
        if (n1 > 0) {

            min = idmin_kernel_32(n1, x, &minf);

            i = n1;
        }

        /* Scalar tail (or the whole vector when n < 32). */
        while (i < n) {
            if (x[i] < minf) {
                min = i;
                minf = x[i];
            }
            i++;
        }
        return (min + 1);

    } else {

        /* Strided access: i is the offset into x, j the logical position. */
        BLASLONG n1 = n & -4;
        while (j < n1) {

            if (x[i] < minf) {
                min = j;
                minf = x[i];
            }
            if (x[i + inc_x] < minf) {
                min = j + 1;
                minf = x[i + inc_x];
            }
            if (x[i + 2 * inc_x] < minf) {
                min = j + 2;
                minf = x[i + 2 * inc_x];
            }
            if (x[i + 3 * inc_x] < minf) {
                min = j + 3;
                minf = x[i + 3 * inc_x];
            }

            i += inc_x * 4;

            j += 4;

        }

        while (j < n) {
            if (x[i] < minf) {
                min = j;
                minf = x[i];
            }
            i += inc_x;
            j++;
        }
        return (min + 1);
    }
}

+ 299
- 0
kernel/zarch/isamax.c View File

@@ -0,0 +1,299 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * isamax_kernel_64: vector search for the single-precision element of x
 * with the largest absolute value.
 *
 * n     number of elements to scan.  The loop count is n >> 6 and every pass
 *       consumes 64 floats (256 bytes), so n must be a positive multiple of
 *       64; the loop is closed by a branch-on-count, so a count of 0 is not
 *       a valid input.
 * x     input vector, read with unit stride.
 * amax  receives the largest absolute value found.
 *
 * Returns the 0-based index of the element whose magnitude is stored in
 * *amax.
 *
 * Register roles inside the asm:
 *   v0        running maxima of |x|, one per 32-bit lane (seeded with the
 *             magnitudes of x[0..3])
 *   v1 / v2   64-bit indices for the even lanes (0, 2) / odd lanes (1, 3)
 *             of v0
 *   v3        constant 32: index step per 32-element half (reused as
 *             scratch after the loop)
 *   v4        index base of the half-block currently being processed
 *   v24-v31   32-bit lane indices 0..31 of the eight vectors of a half-block
 *   r0 / r1   remaining loop count / byte offset into x
 *
 * NOTE(review): the tree reduction uses a strict "greater than" compare and
 * takes the second operand when the compare is false, so an exact tie inside
 * a 32-element half selects the later element.
 * NOTE(review): on the "1:" path wfchsb yields a mask for word 0 only while
 * the following vsel on v1 selects a 64-bit index - confirm the remaining
 * mask bits behave as intended.
 */
static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax)
{
    BLASLONG iamax;

    __asm__ volatile (
        /* Seed the running maxima/indices and build the index constants. */
        "vl %%v0,0(%3) \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = n / 64 passes, r1 = byte offset 0. */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* First half: elements 0..31 of this pass, converted to |x|. */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        /* 8 -> 4 vectors: keep the larger magnitude and its lane index. */
        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vfchsb %%v7,%%v20,%%v21 \n\t"
        "vfchsb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        /* 4 -> 2 vectors. */
        "vfchsb %%v20,%%v16,%%v17 \n\t"
        "vfchsb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        /*
         * 2 -> 1 vector, then widen the 32-bit lane indices to 64 bits
         * (v6 = odd lanes, v5 = even lanes) and add the index base.
         */
        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /*
         * Merge into the running maxima.  The 32-bit compare mask is widened
         * the same way (v8 = odd lanes, v7 = even lanes) to select the
         * 64-bit indices; then the index base advances by 32.
         */
        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Second half: elements 32..63 of this pass, same scheme. */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vfchsb %%v7,%%v20,%%v21 \n\t"
        "vfchsb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        "vfchsb %%v20,%%v16,%%v17 \n\t"
        "vfchsb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /*
         * 4 -> 2 lanes: compare each even lane with its odd neighbour and
         * keep the larger magnitude; on an exact tie keep the one with the
         * smaller index.
         */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v0,%%v3 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        /* 2 -> 1 lane: final value and index. */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* Lanes equal: store the value, take the smaller of the two indices. */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* Lanes differ: pick the larger magnitude together with its index. */
        "wfchsb %%v4,%%v2,%%v0 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        /*
         * %0 is earlyclobber: on the "1:" path it is written by vlgvg before
         * the address of %1 is used by ste, so it must not share a register
         * with that address (or with any other input).
         * v3 is listed as clobbered because the asm writes it (vrepig,
         * veslg, vrepg).
         */
        :"=&r"(iamax),"=m"(*amax)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return iamax;
}
/*
 * isamax driver: returns the 1-based position of the element with the
 * largest absolute value among the n elements of x taken with stride inc_x.
 * Returns 0 when n <= 0 or inc_x <= 0.
 *
 * With unit stride the leading multiple-of-64 part goes through
 * isamax_kernel_64 and the leftover elements through a scalar loop; every
 * other stride is handled by scalar code, four elements per outer step.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG winner = 0;      /* 0-based position of the best element so far */
    FLOAT winner_abs = 0.0;   /* its magnitude */
    BLASLONG pos;             /* logical position being examined */
    BLASLONG off;             /* offset of that position inside x */
    BLASLONG k;

    if (n <= 0 || inc_x <= 0)
        return winner;

    if (inc_x == 1) {
        /* Largest multiple of 64 not exceeding n; 0 means "no kernel call". */
        BLASLONG vec_len = n & -64;

        if (vec_len > 0)
            winner = isamax_kernel_64(vec_len, x, &winner_abs);

        /* Scalar pass over whatever the kernel did not cover. */
        for (pos = vec_len; pos < n; pos++) {
            if (ABS(x[pos]) > winner_abs) {
                winner = pos;
                winner_abs = ABS(x[pos]);
            }
        }
        return winner + 1;
    }

    /* Strided access: groups of four first, then the remaining elements. */
    {
        BLASLONG grouped = n & -4;

        pos = 0;
        off = 0;
        while (pos < grouped) {
            for (k = 0; k < 4; k++) {
                if (ABS(x[off + k * inc_x]) > winner_abs) {
                    winner = pos + k;
                    winner_abs = ABS(x[off + k * inc_x]);
                }
            }
            off += inc_x * 4;
            pos += 4;
        }

        for (; pos < n; pos++, off += inc_x) {
            if (ABS(x[off]) > winner_abs) {
                winner = pos;
                winner_abs = ABS(x[off]);
            }
        }
    }
    return winner + 1;
}

+ 299
- 0
kernel/zarch/isamin.c View File

@@ -0,0 +1,299 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * isamin_kernel_64: vector search for the single-precision element of x
 * with the smallest absolute value.
 *
 * n     number of elements to scan.  The loop count is n >> 6 and every pass
 *       consumes 64 floats (256 bytes), so n must be a positive multiple of
 *       64; the loop is closed by a branch-on-count, so a count of 0 is not
 *       a valid input.
 * x     input vector, read with unit stride.
 * amin  receives the smallest absolute value found.
 *
 * Returns the 0-based index of the element whose magnitude is stored in
 * *amin.
 *
 * Register roles inside the asm:
 *   v0        running minima of |x|, one per 32-bit lane (seeded with the
 *             magnitudes of x[0..3])
 *   v1 / v2   64-bit indices for the even lanes (0, 2) / odd lanes (1, 3)
 *             of v0
 *   v3        constant 32: index step per 32-element half (reused as
 *             scratch after the loop)
 *   v4        index base of the half-block currently being processed
 *   v24-v31   32-bit lane indices 0..31 of the eight vectors of a half-block
 *   r0 / r1   remaining loop count / byte offset into x
 *
 * The compares are the "greater than" compares of the max kernel with the
 * operands swapped, so each select keeps the smaller magnitude.
 *
 * NOTE(review): the compares are strict and the second operand is taken when
 * the compare is false, so an exact tie inside a 32-element half selects the
 * later element.
 * NOTE(review): on the "1:" path wfchsb yields a mask for word 0 only while
 * the following vsel on v1 selects a 64-bit index - confirm the remaining
 * mask bits behave as intended.
 */
static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin)
{
    BLASLONG iamin;

    __asm__ volatile (
        /* Seed the running minima/indices and build the index constants. */
        "vl %%v0,0(%3) \n\t"
        "vflpsb %%v0,%%v0 \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = n / 64 passes, r1 = byte offset 0. */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* First half: elements 0..31 of this pass, converted to |x|. */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        /* 8 -> 4 vectors: keep the smaller magnitude and its lane index. */
        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vfchsb %%v7,%%v21,%%v20 \n\t"
        "vfchsb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        /* 4 -> 2 vectors. */
        "vfchsb %%v20,%%v17,%%v16 \n\t"
        "vfchsb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        /*
         * 2 -> 1 vector, then widen the 32-bit lane indices to 64 bits
         * (v6 = odd lanes, v5 = even lanes) and add the index base.
         */
        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /*
         * Merge into the running minima.  The 32-bit compare mask is widened
         * the same way (v8 = odd lanes, v7 = even lanes) to select the
         * 64-bit indices; then the index base advances by 32.
         */
        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* Second half: elements 32..63 of this pass, same scheme. */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vflpsb %%v16, %%v16 \n\t"
        "vflpsb %%v17, %%v17 \n\t"
        "vflpsb %%v18, %%v18 \n\t"
        "vflpsb %%v19, %%v19 \n\t"
        "vflpsb %%v20, %%v20 \n\t"
        "vflpsb %%v21, %%v21 \n\t"
        "vflpsb %%v22, %%v22 \n\t"
        "vflpsb %%v23, %%v23 \n\t"
        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vfchsb %%v7,%%v21,%%v20 \n\t"
        "vfchsb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        "vfchsb %%v20,%%v17,%%v16 \n\t"
        "vfchsb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /*
         * 4 -> 2 lanes: compare each even lane with its odd neighbour and
         * keep the smaller magnitude; on an exact tie keep the one with the
         * smaller index.
         */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v3,%%v0 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        /* 2 -> 1 lane: final value and index. */
        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* Lanes equal: store the value, take the smaller of the two indices. */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* Lanes differ: pick the smaller magnitude together with its index. */
        "wfchsb %%v4,%%v0,%%v2 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        /*
         * %0 is earlyclobber: on the "1:" path it is written by vlgvg before
         * the address of %1 is used by ste, so it must not share a register
         * with that address (or with any other input).
         * v3 is listed as clobbered because the asm writes it (vrepig,
         * veslg, vrepg).
         */
        :"=&r"(iamin),"=m"(*amin)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return iamin;
}
/*
 * isamin driver: returns the 1-based position of the element with the
 * smallest absolute value among the n elements of x taken with stride
 * inc_x.  Returns 0 when n <= 0 or inc_x <= 0.
 *
 * For unit stride the leading multiple-of-64 part is handled by
 * isamin_kernel_64 and the remainder by a scalar loop; any other stride is
 * handled entirely by the scalar loops.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG j = 0;
    FLOAT minf = 0.0;
    BLASLONG min = 0;

    if (n <= 0 || inc_x <= 0) return (min);

    /*
     * Seed the running minimum with |x[0]| rather than 0.0: a magnitude is
     * never below 0.0, so with a 0.0 seed the scalar loops could never
     * update the result and position 1 would be reported regardless of the
     * data.  The loops still start at element 0, which only makes the first
     * comparison redundant.  The kernel overwrites minf when it runs.
     */
    minf = ABS(x[0]);

    if (inc_x == 1) {

        BLASLONG n1 = n & -64;
        if (n1 > 0) {

            min = isamin_kernel_64(n1, x, &minf);

            i = n1;
        }

        /* Scalar tail (or the whole vector when n < 64). */
        while (i < n) {
            if (ABS(x[i]) < minf) {
                min = i;
                minf = ABS(x[i]);
            }
            i++;
        }
        return (min + 1);

    } else {

        /* Strided access: i is the offset into x, j the logical position. */
        BLASLONG n1 = n & -4;
        while (j < n1) {

            if (ABS(x[i]) < minf) {
                min = j;
                minf = ABS(x[i]);
            }
            if (ABS(x[i + inc_x]) < minf) {
                min = j + 1;
                minf = ABS(x[i + inc_x]);
            }
            if (ABS(x[i + 2 * inc_x]) < minf) {
                min = j + 2;
                minf = ABS(x[i + 2 * inc_x]);
            }
            if (ABS(x[i + 3 * inc_x]) < minf) {
                min = j + 3;
                minf = ABS(x[i + 3 * inc_x]);
            }

            i += inc_x * 4;

            j += 4;

        }

        while (j < n) {
            if (ABS(x[i]) < minf) {
                min = j;
                minf = ABS(x[i]);
            }
            i += inc_x;
            j++;
        }
        return (min + 1);
    }
}

+ 275
- 0
kernel/zarch/ismax.c View File

@@ -0,0 +1,275 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Vector search for the largest element (signed compare, no absolute value)
 * in the first n single-precision entries of x.
 *
 *   n    element count; must be a positive multiple of 64, because the loop
 *        counter is n >> 6 (counted down by brctg) and every iteration
 *        consumes 256 bytes (64 floats) of x.
 *   x    contiguous input vector.
 *   max  (out) receives the largest value found.
 *
 * Returns the 0-based index extracted from the index vector; the caller
 * (CNAME) adds 1 to produce the BLAS-style position.
 *
 * Register roles visible in the code below:
 *   v0        running per-lane maximum (seeded from the first 4 floats)
 *   v1, v2    running indices kept as 64-bit lanes
 *   v3        constant 32 inside the loop (half-iteration stride), reused as
 *             scratch during the final reduction
 *   v4        base index of the current 32-element half, advanced by v3
 *   v24..v31  constant lane offsets 0..31
 *
 * v3 is written by this statement and is therefore listed as clobbered;
 * omitting it would let the compiler keep a live value in v3/f3 across the
 * asm.
 */
static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max)
{
    BLASLONG imax;

    __asm__ volatile (
        /* --- set-up: seed maximum, index vectors and lane-offset tables --- */
        "vl %%v0,0(%3) \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = n / 64 iterations, r1 = byte offset into x */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* --- first half of the iteration: bytes 0..127 (32 floats) --- */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        /* pairwise compare; select the winning values and their offsets */
        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vfchsb %%v7,%%v20,%%v21 \n\t"
        "vfchsb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        "vfchsb %%v20,%%v16,%%v17 \n\t"
        "vfchsb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* split the 32-bit offsets into two 64-bit vectors, add base v4 */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /* merge with the running maximum / indices, then advance base */
        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* --- second half of the iteration: bytes 128..255 --- */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vfchsb %%v5,%%v16,%%v17 \n\t"
        "vfchsb %%v6,%%v18,%%v19 \n\t"
        "vfchsb %%v7,%%v20,%%v21 \n\t"
        "vfchsb %%v8,%%v22,%%v23 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        "vfchsb %%v20,%%v16,%%v17 \n\t"
        "vfchsb %%v21,%%v18,%%v19 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        "vfchsb %%v18,%%v16,%%v17 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchsb %%v7,%%v16,%%v0 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* --- horizontal reduction of the four lanes (v3 reused as scratch);
               on equal values the index comparison decides --- */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v0,%%v3 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* equal candidates: store the value, take the smaller index */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* different candidates: pick value and index by comparison */
        "wfchsb %%v4,%%v2,%%v0 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(imax),"=m"(*max)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return imax;
}
/*
 * ISMAX entry point: position of the largest element of x (plain signed
 * comparison, no absolute value).
 *
 *   n      number of elements to examine
 *   x      input vector
 *   inc_x  stride between consecutive elements, in elements
 *
 * Returns the 1-based position of the first largest element, or 0 when
 * n <= 0 or inc_x <= 0.
 *
 * The running maximum is seeded from x[0] rather than from 0.0. With a 0.0
 * seed a vector containing only negative values never satisfies
 * "x[i] > maxf" on the scalar paths, and position 1 would be reported
 * regardless of the data.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG j = 0;
    FLOAT maxf;
    BLASLONG max = 0;

    if (n <= 0 || inc_x <= 0) return (max);

    /* First element is the initial candidate (index 0). */
    maxf = x[0];

    if (inc_x == 1) {

        /* Largest multiple of 64 not exceeding n goes to the vector kernel,
           which overwrites maxf with the maximum of that prefix. */
        BLASLONG n1 = n & -64;
        if (n1 > 0) {

            max = ismax_kernel_64(n1, x, &maxf);

            i = n1;
        }

        /* Scalar tail (or the whole vector when n < 64). */
        while (i < n) {
            if (x[i] > maxf) {
                max = i;
                maxf = x[i];
            }
            i++;
        }
        return (max + 1);

    } else {

        /* Strided access: i is the element offset into x, j the logical
           index. Main loop is unrolled four times. */
        BLASLONG n1 = n & -4;
        while (j < n1) {

            if (x[i] > maxf) {
                max = j;
                maxf = x[i];
            }
            if (x[i + inc_x] > maxf) {
                max = j + 1;
                maxf = x[i + inc_x];
            }
            if (x[i + 2 * inc_x] > maxf) {
                max = j + 2;
                maxf = x[i + 2 * inc_x];
            }
            if (x[i + 3 * inc_x] > maxf) {
                max = j + 3;
                maxf = x[i + 3 * inc_x];
            }

            i += inc_x * 4;

            j += 4;

        }

        /* Remaining 0..3 elements. */
        while (j < n) {
            if (x[i] > maxf) {
                max = j;
                maxf = x[i];
            }
            i += inc_x;
            j++;
        }
        return (max + 1);
    }
}

+ 275
- 0
kernel/zarch/ismin.c View File

@@ -0,0 +1,275 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Vector search for the smallest element (signed compare, no absolute value)
 * in the first n single-precision entries of x.
 *
 *   n    element count; must be a positive multiple of 64, because the loop
 *        counter is n >> 6 (counted down by brctg) and every iteration
 *        consumes 256 bytes (64 floats) of x.
 *   x    contiguous input vector.
 *   min  (out) receives the smallest value found.
 *
 * Returns the 0-based index extracted from the index vector; the caller
 * (CNAME) adds 1 to produce the BLAS-style position.
 *
 * Register roles visible in the code below:
 *   v0        running per-lane minimum (seeded from the first 4 floats)
 *   v1, v2    running indices kept as 64-bit lanes
 *   v3        constant 32 inside the loop (half-iteration stride), reused as
 *             scratch during the final reduction
 *   v4        base index of the current 32-element half, advanced by v3
 *   v24..v31  constant lane offsets 0..31
 *
 * v3 is written by this statement and is therefore listed as clobbered;
 * omitting it would let the compiler keep a live value in v3/f3 across the
 * asm.
 */
static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min)
{
    BLASLONG imin;

    __asm__ volatile (
        /* --- set-up: seed minimum, index vectors and lane-offset tables --- */
        "vl %%v0,0(%3) \n\t"
        "vleig %%v1,0,0 \n\t"
        "vleig %%v1,2,1 \n\t"
        "vleig %%v2,1,0 \n\t"
        "vleig %%v2,3,1 \n\t"
        "vrepig %%v3,32 \n\t"
        "vzero %%v4 \n\t"
        "vleif %%v24,0,0 \n\t"
        "vleif %%v24,1,1 \n\t"
        "vleif %%v24,2,2 \n\t"
        "vleif %%v24,3,3 \n\t"
        "vleif %%v25,4,0 \n\t"
        "vleif %%v25,5,1 \n\t"
        "vleif %%v25,6,2 \n\t"
        "vleif %%v25,7,3 \n\t"
        "vleif %%v26,8,0 \n\t"
        "vleif %%v26,9,1 \n\t"
        "vleif %%v26,10,2 \n\t"
        "vleif %%v26,11,3 \n\t"
        "vleif %%v27,12,0 \n\t"
        "vleif %%v27,13,1 \n\t"
        "vleif %%v27,14,2 \n\t"
        "vleif %%v27,15,3 \n\t"
        "vleif %%v28,16,0 \n\t"
        "vleif %%v28,17,1 \n\t"
        "vleif %%v28,18,2 \n\t"
        "vleif %%v28,19,3 \n\t"
        "vleif %%v29,20,0 \n\t"
        "vleif %%v29,21,1 \n\t"
        "vleif %%v29,22,2 \n\t"
        "vleif %%v29,23,3 \n\t"
        "vleif %%v30,24,0 \n\t"
        "vleif %%v30,25,1 \n\t"
        "vleif %%v30,26,2 \n\t"
        "vleif %%v30,27,3 \n\t"
        "vleif %%v31,28,0 \n\t"
        "vleif %%v31,29,1 \n\t"
        "vleif %%v31,30,2 \n\t"
        "vleif %%v31,31,3 \n\t"
        /* r0 = n / 64 iterations, r1 = byte offset into x */
        "srlg %%r0,%2,6 \n\t"
        "xgr %%r1,%%r1 \n\t"
        "0: \n\t"
        "pfd 1, 1024(%%r1,%3) \n\t"

        /* --- first half of the iteration: bytes 0..127 (32 floats) --- */
        "vl %%v16,0(%%r1,%3) \n\t"
        "vl %%v17,16(%%r1,%3) \n\t"
        "vl %%v18,32(%%r1,%3) \n\t"
        "vl %%v19,48(%%r1,%3) \n\t"
        "vl %%v20,64(%%r1,%3) \n\t"
        "vl %%v21,80(%%r1,%3) \n\t"
        "vl %%v22,96(%%r1,%3) \n\t"
        "vl %%v23,112(%%r1,%3) \n\t"
        /* pairwise compare (operands swapped relative to the max kernel);
           select the winning values and their offsets */
        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vfchsb %%v7,%%v21,%%v20 \n\t"
        "vfchsb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        "vfchsb %%v20,%%v17,%%v16 \n\t"
        "vfchsb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        /* split the 32-bit offsets into two 64-bit vectors, add base v4 */
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        /* merge with the running minimum / indices, then advance base */
        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        /* --- second half of the iteration: bytes 128..255 --- */
        "vl %%v16,128(%%r1,%3) \n\t"
        "vl %%v17,144(%%r1,%3) \n\t"
        "vl %%v18,160(%%r1,%3) \n\t"
        "vl %%v19,176(%%r1,%3) \n\t"
        "vl %%v20,192(%%r1,%3) \n\t"
        "vl %%v21,208(%%r1,%3) \n\t"
        "vl %%v22,224(%%r1,%3) \n\t"
        "vl %%v23,240(%%r1,%3) \n\t"
        "vfchsb %%v5,%%v17,%%v16 \n\t"
        "vfchsb %%v6,%%v19,%%v18 \n\t"
        "vfchsb %%v7,%%v21,%%v20 \n\t"
        "vfchsb %%v8,%%v23,%%v22 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
        "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
        "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
        "vsel %%v18,%%v20,%%v21,%%v7 \n\t"
        "vsel %%v7,%%v28,%%v29,%%v7 \n\t"
        "vsel %%v19,%%v22,%%v23,%%v8 \n\t"
        "vsel %%v8,%%v30,%%v31,%%v8 \n\t"

        "vfchsb %%v20,%%v17,%%v16 \n\t"
        "vfchsb %%v21,%%v19,%%v18 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v20 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v20 \n\t"
        "vsel %%v17,%%v18,%%v19,%%v21 \n\t"
        "vsel %%v6,%%v7,%%v8,%%v21 \n\t"

        "vfchsb %%v18,%%v17,%%v16 \n\t"
        "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
        "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
        "vsegf %%v6,%%v5 \n\t"
        "vesrlg %%v5,%%v5,32 \n\t"
        "vag %%v5,%%v5,%%v4 \n\t"
        "vag %%v6,%%v6,%%v4 \n\t"

        "vfchsb %%v7,%%v0,%%v16 \n\t"
        "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
        "vsegf %%v8,%%v7 \n\t"
        "vesrlg %%v7,%%v7,32 \n\t"
        "vsegf %%v7,%%v7 \n\t"
        "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
        "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
        "vag %%v4,%%v4,%%v3 \n\t"

        "agfi %%r1, 256 \n\t"
        "brctg %%r0, 0b \n\t"

        /* --- horizontal reduction of the four lanes (v3 reused as scratch);
               on equal values the index comparison decides --- */
        "veslg %%v3,%%v0,32 \n\t"
        "vfchsb %%v4,%%v3,%%v0 \n\t"
        "vchlg %%v5,%%v2,%%v1 \n\t"
        "vfcesb %%v6,%%v0,%%v3 \n\t"
        "vn %%v5,%%v5,%%v6 \n\t"
        "vo %%v4,%%v4,%%v5 \n\t"
        "vsel %%v0,%%v0,%%v3,%%v4 \n\t"
        "vesrlg %%v4,%%v4,32 \n\t"
        "vsegf %%v4,%%v4 \n\t"
        "vsel %%v1,%%v1,%%v2,%%v4 \n\t"

        "vrepf %%v2,%%v0,2 \n\t"
        "vrepg %%v3,%%v1,1 \n\t"
        "wfcsb %%v2,%%v0 \n\t"
        "jne 1f \n\t"
        /* equal candidates: store the value, take the smaller index */
        "vstef %%v0,%1,0 \n\t"
        "vmnlg %%v0,%%v1,%%v3 \n\t"
        "vlgvg %0,%%v0,0 \n\t"
        "j 2f \n\t"
        "1: \n\t"
        /* different candidates: pick value and index by comparison */
        "wfchsb %%v4,%%v0,%%v2 \n\t"
        "vsel %%v1,%%v3,%%v1,%%v4 \n\t"
        "vsel %%v0,%%v2,%%v0,%%v4 \n\t"
        "vlgvg %0,%%v1,0 \n\t"
        "ste %%f0,%1 \n\t"
        "2: \n\t"
        "nop "
        :"=r"(imin),"=m"(*min)
        :"r"(n),"ZR"((const FLOAT (*)[n])x)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return imin;
}
/*
 * ISMIN entry point: position of the smallest element of x (plain signed
 * comparison, no absolute value).
 *
 *   n      number of elements to examine
 *   x      input vector
 *   inc_x  stride between consecutive elements, in elements
 *
 * Returns the 1-based position of the first smallest element, or 0 when
 * n <= 0 or inc_x <= 0.
 *
 * The running minimum is seeded from x[0] rather than from 0.0. With a 0.0
 * seed a vector containing only positive values never satisfies
 * "x[i] < minf" on the scalar paths, and position 1 would be reported
 * regardless of the data.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG j = 0;
    FLOAT minf;
    BLASLONG min = 0;

    if (n <= 0 || inc_x <= 0) return (min);

    /* First element is the initial candidate (index 0). */
    minf = x[0];

    if (inc_x == 1) {

        /* Largest multiple of 64 not exceeding n goes to the vector kernel,
           which overwrites minf with the minimum of that prefix. */
        BLASLONG n1 = n & -64;
        if (n1 > 0) {

            min = ismin_kernel_64(n1, x, &minf);

            i = n1;
        }

        /* Scalar tail (or the whole vector when n < 64). */
        while (i < n) {
            if (x[i] < minf) {
                min = i;
                minf = x[i];
            }
            i++;
        }
        return (min + 1);

    } else {

        /* Strided access: i is the element offset into x, j the logical
           index. Main loop is unrolled four times. */
        BLASLONG n1 = n & -4;
        while (j < n1) {

            if (x[i] < minf) {
                min = j;
                minf = x[i];
            }
            if (x[i + inc_x] < minf) {
                min = j + 1;
                minf = x[i + inc_x];
            }
            if (x[i + 2 * inc_x] < minf) {
                min = j + 2;
                minf = x[i + 2 * inc_x];
            }
            if (x[i + 3 * inc_x] < minf) {
                min = j + 3;
                minf = x[i + 3 * inc_x];
            }

            i += inc_x * 4;

            j += 4;

        }

        /* Remaining 0..3 elements. */
        while (j < n) {
            if (x[i] < minf) {
                min = j;
                minf = x[i];
            }
            i += inc_x;
            j++;
        }
        return (min + 1);
    }
}

+ 154
- 180
kernel/zarch/izamax.c View File

@@ -24,190 +24,165 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>
#define ABS fabs
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])



/**
* Find maximum index
* Warning: requirements n>0 and n % 16 == 0
* @param n
* @param x pointer to the vector
* @param maxf (out) maximum absolute value .( only for output )
* @return index
*/
static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
__asm__(
"pfd 1, 0(%[ptr_x]) \n\t"
"vleig %%v16,0,0 \n\t"
"vleig %%v16,1,1 \n\t"
"vleig %%v17,2,0 \n\t"
"vleig %%v17,3,1 \n\t"
"vleig %%v18,4,0 \n\t"
"vleig %%v18,5,1 \n\t"
"vleig %%v19,6,0 \n\t"
"vleig %%v19,7,1 \n\t"
"vleig %%v20,8,0 \n\t"
"vleig %%v20,9,1 \n\t"
"vleig %%v21,10,0 \n\t"
"vleig %%v21,11,1 \n\t"
"vleig %%v22,12,0 \n\t"
"vleig %%v22,13,1 \n\t"
"vleig %%v23,14,0 \n\t"
"vleig %%v23,15,1 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vzero %%v6 \n\t"
"vzero %%v7 \n\t"
"vrepig %%v4,16 \n\t"
"vzero %%v5 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))

static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax)
{
BLASLONG iamax;

__asm__ volatile (
"vleg %%v0,0(%3),0 \n\t"
"vleg %%v1,8(%3),0 \n\t"
"vleg %%v0,16(%3),1 \n\t"
"vleg %%v1,24(%3),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v1,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,8 \n\t"
"vzero %%v3 \n\t"
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"srlg %%r0,%2,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"

"vleg %%v16,0(%%r1,%3),0 \n\t"
"vleg %%v17,8(%%r1,%3),0 \n\t"
"vleg %%v16,16(%%r1,%3),1 \n\t"
"vleg %%v17,24(%%r1,%3),1 \n\t"
"vleg %%v18,32(%%r1,%3),0 \n\t"
"vleg %%v19,40(%%r1,%3),0 \n\t"
"vleg %%v18,48(%%r1,%3),1 \n\t"
"vleg %%v19,56(%%r1,%3),1 \n\t"
"vleg %%v20,64(%%r1,%3),0 \n\t"
"vleg %%v21,72(%%r1,%3),0 \n\t"
"vleg %%v20,80(%%r1,%3),1 \n\t"
"vleg %%v21,88(%%r1,%3),1 \n\t"
"vleg %%v22,96(%%r1,%3),0 \n\t"
"vleg %%v23,104(%%r1,%3),0 \n\t"
"vleg %%v22,112(%%r1,%3),1 \n\t"
"vleg %%v23,120(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vleg %%v24 , 0(%[ptr_tmp]),0 \n\t"
"vleg %%v25 , 8(%[ptr_tmp]),0 \n\t"
"vleg %%v24 , 16(%[ptr_tmp]),1 \n\t"
"vleg %%v25 , 24(%[ptr_tmp]),1 \n\t"
"vleg %%v26 , 32(%[ptr_tmp]),0 \n\t"
"vleg %%v27 , 40(%[ptr_tmp]),0 \n\t"
"vleg %%v26 , 48(%[ptr_tmp]),1 \n\t"
"vleg %%v27 , 56(%[ptr_tmp]),1 \n\t"
"vleg %%v28 , 64(%[ptr_tmp]),0 \n\t"
"vleg %%v29 , 72(%[ptr_tmp]),0 \n\t"
"vleg %%v28 , 80(%[ptr_tmp]),1 \n\t"
"vleg %%v29 , 88(%[ptr_tmp]),1 \n\t"
"vleg %%v30 , 96(%[ptr_tmp]),0 \n\t"
"vleg %%v31 ,104(%[ptr_tmp]),0 \n\t"
"vleg %%v30 ,112(%[ptr_tmp]),1 \n\t"
"vleg %%v31 ,120(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v0,%%v24,%%v25 \n\t"
"vfadb %%v1,%%v26,%%v27 \n\t"
"vfadb %%v2,%%v28,%%v29 \n\t"
"vfadb %%v3,%%v30,%%v31 \n\t"
"vleg %%v24 , 128(%[ptr_tmp]),0 \n\t"
"vleg %%v25 , 136(%[ptr_tmp]),0 \n\t"
"vleg %%v24 , 144(%[ptr_tmp]),1 \n\t"
"vleg %%v25 , 152(%[ptr_tmp]),1 \n\t"
"vleg %%v26 , 160(%[ptr_tmp]),0 \n\t"
"vleg %%v27 , 168(%[ptr_tmp]),0 \n\t"
"vleg %%v26 , 176(%[ptr_tmp]),1 \n\t"
"vleg %%v27 , 184(%[ptr_tmp]),1 \n\t"
"vleg %%v28 , 192(%[ptr_tmp]),0 \n\t"
"vleg %%v29 , 200(%[ptr_tmp]),0 \n\t"
"vleg %%v28 , 208(%[ptr_tmp]),1 \n\t"
"vleg %%v29 , 216(%[ptr_tmp]),1 \n\t"
"vleg %%v30 , 224(%[ptr_tmp]),0 \n\t"
"vleg %%v31 , 232(%[ptr_tmp]),0 \n\t"
"vleg %%v30 , 240(%[ptr_tmp]),1 \n\t"
"vleg %%v31 , 248(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v24,%%v24,%%v25 \n\t"
"vfadb %%v26,%%v26,%%v27 \n\t"
"vfadb %%v28,%%v28,%%v29 \n\t"
"vfadb %%v30,%%v30,%%v31 \n\t"
"vfchdb %%v25,%%v1,%%v0 \n\t"
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
"vfchdb %%v27,%%v3,%%v2 \n\t "
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
"vfchdb %%v25,%%v26,%%v24 \n\t"
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
"vfchdb %%v27,%%v30,%%v28 \n\t"
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
"vfchdb %%v24, %%v1,%%v31 \n\t"
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
"vfchdb %%v30, %%v27,%%v3 \n\t"
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vfchdb %%v0, %%v31,%%v28 \n\t"
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
"vag %%v25,%%v25,%%v5 \n\t"
//cmp with previous
"vfchdb %%v30, %%v27,%%v6 \n\t"
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"

//xtract index
"vrepg %%v26,%%v6,1 \n\t"
"vrepg %%v5,%%v7,1 \n\t"
"wfcdb %%v26,%%v6 \n\t"
"jne 2f \n\t"
"vsteg %%v6,%[maxf],0 \n\t"
"vmnlg %%v1,%%v5,%%v7 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"j 3 \n\t"
"2: \n\t"
"wfchdb %%v16,%%v26,%%v6 \n\t"
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"std %%f0,%[maxf] \n\t"
"3: \n\t"
: [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x)
: "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"

);
return index;

"vfchdb %%v4,%%v16,%%v17 \n\t"
"vfchdb %%v5,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"

"vfchdb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchdb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"vleg %%v16,128(%%r1,%3),0 \n\t"
"vleg %%v17,136(%%r1,%3),0 \n\t"
"vleg %%v16,144(%%r1,%3),1 \n\t"
"vleg %%v17,152(%%r1,%3),1 \n\t"
"vleg %%v18,160(%%r1,%3),0 \n\t"
"vleg %%v19,168(%%r1,%3),0 \n\t"
"vleg %%v18,176(%%r1,%3),1 \n\t"
"vleg %%v19,184(%%r1,%3),1 \n\t"
"vleg %%v20,192(%%r1,%3),0 \n\t"
"vleg %%v21,200(%%r1,%3),0 \n\t"
"vleg %%v20,208(%%r1,%3),1 \n\t"
"vleg %%v21,216(%%r1,%3),1 \n\t"
"vleg %%v22,224(%%r1,%3),0 \n\t"
"vleg %%v23,232(%%r1,%3),0 \n\t"
"vleg %%v22,240(%%r1,%3),1 \n\t"
"vleg %%v23,248(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v4,%%v16,%%v17 \n\t"
"vfchdb %%v5,%%v18,%%v19 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"

"vfchdb %%v18,%%v16,%%v17 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchdb %%v5,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v2,%%v0 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%1 \n\t"
"2: \n\t"
"nop "
:"=r"(iamax),"=m"(*amax)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);

return iamax;
}



BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i = 0;
@@ -223,9 +198,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG n1 = n & -16;
if (n1 > 0) {

max = ziamax_kernel_16_TUNED(n1, x, &maxf);
max = izamax_kernel_16(n1, x, &maxf);

i = n1;
ix = n1 << 1;
}

while(i < n)
@@ -260,7 +235,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
}
return (max + 1);
}
}



+ 182
- 218
kernel/zarch/izamin.c View File

@@ -24,253 +24,217 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>
#define ABS fabs
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])

/**
* Find minimum index
* Warning: requirements n>0 and n % 16 == 0
* @param n
* @param x pointer to the vector
* @param minf (out) minimum absolute value .( only for output )
* @return minimum index
*/
static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index ;
__asm__(
"pfd 1, 0(%[ptr_x]) \n\t"
"vleig %%v16,0,0 \n\t"
"vleig %%v16,1,1 \n\t"
"vleig %%v17,2,0 \n\t"
"vleig %%v17,3,1 \n\t"
"vleig %%v18,4,0 \n\t"
"vleig %%v18,5,1 \n\t"
"vleig %%v19,6,0 \n\t"
"vleig %%v19,7,1 \n\t"
"vleig %%v20,8,0 \n\t"
"vleig %%v20,9,1 \n\t"
"vleig %%v21,10,0 \n\t"
"vleig %%v21,11,1 \n\t"
"vleig %%v22,12,0 \n\t"
"vleig %%v22,13,1 \n\t"
"vleig %%v23,14,0 \n\t"
"vleig %%v23,15,1 \n\t"
"ld %%f6,0(%[ptr_x]) \n\t"
"lpdbr %%f6,%%f6 \n\t"
"ld %%f7,8(%[ptr_x]) \n\t"
"lpdbr %%f7,%%f7 \n\t"
"adbr %%f6,%%f7 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vrepg %%v6,%%v6,0 \n\t"
"vzero %%v7 \n\t"
"vrepig %%v4,16 \n\t"
"vzero %%v5 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))

static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin)
{
BLASLONG iamin;

__asm__ volatile (
"vleg %%v0,0(%3),0 \n\t"
"vleg %%v1,8(%3),0 \n\t"
"vleg %%v0,16(%3),1 \n\t"
"vleg %%v1,24(%3),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v1,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vleig %%v1,0,0 \n\t"
"vleig %%v1,1,1 \n\t"
"vrepig %%v2,8 \n\t"
"vzero %%v3 \n\t"
"vleig %%v24,0,0 \n\t"
"vleig %%v24,1,1 \n\t"
"vleig %%v25,2,0 \n\t"
"vleig %%v25,3,1 \n\t"
"vleig %%v26,4,0 \n\t"
"vleig %%v26,5,1 \n\t"
"vleig %%v27,6,0 \n\t"
"vleig %%v27,7,1 \n\t"
"srlg %%r0,%2,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%3) \n\t"

"vleg %%v16,0(%%r1,%3),0 \n\t"
"vleg %%v17,8(%%r1,%3),0 \n\t"
"vleg %%v16,16(%%r1,%3),1 \n\t"
"vleg %%v17,24(%%r1,%3),1 \n\t"
"vleg %%v18,32(%%r1,%3),0 \n\t"
"vleg %%v19,40(%%r1,%3),0 \n\t"
"vleg %%v18,48(%%r1,%3),1 \n\t"
"vleg %%v19,56(%%r1,%3),1 \n\t"
"vleg %%v20,64(%%r1,%3),0 \n\t"
"vleg %%v21,72(%%r1,%3),0 \n\t"
"vleg %%v20,80(%%r1,%3),1 \n\t"
"vleg %%v21,88(%%r1,%3),1 \n\t"
"vleg %%v22,96(%%r1,%3),0 \n\t"
"vleg %%v23,104(%%r1,%3),0 \n\t"
"vleg %%v22,112(%%r1,%3),1 \n\t"
"vleg %%v23,120(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vleg %%v24 , 0(%[ptr_tmp]),0 \n\t"
"vleg %%v25 , 8(%[ptr_tmp]),0 \n\t"
"vleg %%v24 , 16(%[ptr_tmp]),1 \n\t"
"vleg %%v25 , 24(%[ptr_tmp]),1 \n\t"
"vleg %%v26 , 32(%[ptr_tmp]),0 \n\t"
"vleg %%v27 , 40(%[ptr_tmp]),0 \n\t"
"vleg %%v26 , 48(%[ptr_tmp]),1 \n\t"
"vleg %%v27 , 56(%[ptr_tmp]),1 \n\t"
"vleg %%v28 , 64(%[ptr_tmp]),0 \n\t"
"vleg %%v29 , 72(%[ptr_tmp]),0 \n\t"
"vleg %%v28 , 80(%[ptr_tmp]),1 \n\t"
"vleg %%v29 , 88(%[ptr_tmp]),1 \n\t"
"vleg %%v30 , 96(%[ptr_tmp]),0 \n\t"
"vleg %%v31 ,104(%[ptr_tmp]),0 \n\t"
"vleg %%v30 ,112(%[ptr_tmp]),1 \n\t"
"vleg %%v31 ,120(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v0,%%v24,%%v25 \n\t"
"vfadb %%v1,%%v26,%%v27 \n\t"
"vfadb %%v2,%%v28,%%v29 \n\t"
"vfadb %%v3,%%v30,%%v31 \n\t"
"vleg %%v24 ,128(%[ptr_tmp]),0 \n\t"
"vleg %%v25 ,136(%[ptr_tmp]),0 \n\t"
"vleg %%v24 ,144(%[ptr_tmp]),1 \n\t"
"vleg %%v25 ,152(%[ptr_tmp]),1 \n\t"
"vleg %%v26 ,160(%[ptr_tmp]),0 \n\t"
"vleg %%v27 ,168(%[ptr_tmp]),0 \n\t"
"vleg %%v26 ,176(%[ptr_tmp]),1 \n\t"
"vleg %%v27 ,184(%[ptr_tmp]),1 \n\t"
"vleg %%v28 ,192(%[ptr_tmp]),0 \n\t"
"vleg %%v29 ,200(%[ptr_tmp]),0 \n\t"
"vleg %%v28 ,208(%[ptr_tmp]),1 \n\t"
"vleg %%v29 ,216(%[ptr_tmp]),1 \n\t"
"vleg %%v30 ,224(%[ptr_tmp]),0 \n\t"
"vleg %%v31 ,232(%[ptr_tmp]),0 \n\t"
"vleg %%v30 ,240(%[ptr_tmp]),1 \n\t"
"vleg %%v31 ,248(%[ptr_tmp]),1 \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v24,%%v24,%%v25 \n\t"
"vfadb %%v26,%%v26,%%v27 \n\t"
"vfadb %%v28,%%v28,%%v29 \n\t"
"vfadb %%v30,%%v30,%%v31 \n\t"
"vfchdb %%v25,%%v0 ,%%v1 \n\t"
"vsel %%v29,%%v17,%%v16,%%v25 \n\t"
"vsel %%v31,%%v1,%%v0,%%v25 \n\t"
"vfchdb %%v27,%%v2,%%v3 \n\t"
"vsel %%v0,%%v19,%%v18,%%v27 \n\t"
"vsel %%v1,%%v3,%%v2,%%v27 \n\t"
"vfchdb %%v25,%%v24,%%v26 \n\t"
"vsel %%v2,%%v21,%%v20,%%v25 \n\t"
"vsel %%v3,%%v26,%%v24,%%v25 \n\t"
"vfchdb %%v27,%%v28,%%v30 \n\t"
"vsel %%v25,%%v23,%%v22,%%v27 \n\t"
"vsel %%v27,%%v30,%%v28,%%v27 \n\t"
"vfchdb %%v24,%%v31, %%v1 \n\t"
"vsel %%v26,%%v0,%%v29,%%v24 \n\t"
"vsel %%v28,%%v1,%%v31,%%v24 \n\t"
"vfchdb %%v30,%%v3, %%v27 \n\t"
"vsel %%v29,%%v25,%%v2,%%v30 \n\t"
"vsel %%v31,%%v27,%%v3 ,%%v30 \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vfchdb %%v0,%%v28, %%v31 \n\t"
"vsel %%v25,%%v29,%%v26,%%v0 \n\t"
"vsel %%v27,%%v31,%%v28,%%v0 \n\t"
"vag %%v25,%%v25,%%v5 \n\t"
//cmp with previous
"vfchdb %%v30,%%v6 , %%v27 \n\t"
"vsel %%v7,%%v25,%%v7,%%v30 \n\t"
"vsel %%v6,%%v27,%%v6,%%v30 \n\t"
"vag %%v5,%%v5,%%v4 \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"

//xtract index
"vrepg %%v26,%%v6,1 \n\t"
"vrepg %%v5,%%v7,1 \n\t"
"wfcdb %%v26,%%v6 \n\t"
"jne 2f \n\t"
"vsteg %%v6,%[minf],0 \n\t"
"vmnlg %%v1,%%v5,%%v7 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"j 3f \n\t"
"2: \n\t"
"wfchdb %%v16,%%v6 ,%%v26 \n\t"
"vsel %%v1,%%v5,%%v7,%%v16 \n\t"
"vsel %%v0,%%v26,%%v6,%%v16 \n\t"
"vlgvg %[index],%%v1,0 \n\t"
"std %%f0,%[minf] \n\t"
"3: \n\t"

: [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x)
: "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"

);

return index;
"vfchdb %%v4,%%v17,%%v16 \n\t"
"vfchdb %%v5,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"

"vfchdb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchdb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"vleg %%v16,128(%%r1,%3),0 \n\t"
"vleg %%v17,136(%%r1,%3),0 \n\t"
"vleg %%v16,144(%%r1,%3),1 \n\t"
"vleg %%v17,152(%%r1,%3),1 \n\t"
"vleg %%v18,160(%%r1,%3),0 \n\t"
"vleg %%v19,168(%%r1,%3),0 \n\t"
"vleg %%v18,176(%%r1,%3),1 \n\t"
"vleg %%v19,184(%%r1,%3),1 \n\t"
"vleg %%v20,192(%%r1,%3),0 \n\t"
"vleg %%v21,200(%%r1,%3),0 \n\t"
"vleg %%v20,208(%%r1,%3),1 \n\t"
"vleg %%v21,216(%%r1,%3),1 \n\t"
"vleg %%v22,224(%%r1,%3),0 \n\t"
"vleg %%v23,232(%%r1,%3),0 \n\t"
"vleg %%v22,240(%%r1,%3),1 \n\t"
"vleg %%v23,248(%%r1,%3),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v4,%%v17,%%v16 \n\t"
"vfchdb %%v5,%%v19,%%v18 \n\t"
"vsel %%v16,%%v16,%%v17,%%v4 \n\t"
"vsel %%v4,%%v24,%%v25,%%v4 \n\t"
"vsel %%v17,%%v18,%%v19,%%v5 \n\t"
"vsel %%v5,%%v26,%%v27,%%v5 \n\t"

"vfchdb %%v18,%%v17,%%v16 \n\t"
"vsel %%v16,%%v16,%%v17,%%v18 \n\t"
"vsel %%v4,%%v4,%%v5,%%v18 \n\t"
"vag %%v4,%%v4,%%v3 \n\t"

"vfchdb %%v5,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v5 \n\t"
"vsel %%v1,%%v4,%%v1,%%v5 \n\t"
"vag %%v3,%%v3,%%v2 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

"vrepg %%v2,%%v0,1 \n\t"
"vrepg %%v3,%%v1,1 \n\t"
"wfcdb %%v2,%%v0 \n\t"
"jne 1f \n\t"
"vsteg %%v0,%1,0 \n\t"
"vmnlg %%v0,%%v1,%%v3 \n\t"
"vlgvg %0,%%v0,0 \n\t"
"j 2f \n\t"
"1: \n\t"
"wfchdb %%v4,%%v0,%%v2 \n\t"
"vsel %%v1,%%v3,%%v1,%%v4 \n\t"
"vsel %%v0,%%v2,%%v0,%%v4 \n\t"
"vlgvg %0,%%v1,0 \n\t"
"std %%f0,%1 \n\t"
"2: \n\t"
"nop "
:"=r"(iamin),"=m"(*amin)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);

return iamin;
}



BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
BLASLONG ix=0;
FLOAT minf;
BLASLONG min=0;
BLASLONG i = 0;
BLASLONG ix = 0;
FLOAT minf = 0;
BLASLONG min = 0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0) return(min);

if (inc_x == 1) {

BLASLONG n1 = n & -16;
if (n1 > 0) {
BLASLONG n1 = n & -16;
if (n1 > 0) {

min = izamin_kernel_16(n1, x, &minf);

min = ziamin_kernel_16_TUNED(n1, x, &minf);
i = n1;
ix = n1 << 1;
}
else {
//assign minf
minf = CABS1(x,0);
ix += 2;
i++;
}
}

while(i < n)
while(i < n)
{
if( CABS1(x,ix) < minf )
{
if( CABS1(x,ix) < minf )
{
min = i;
minf = CABS1(x,ix);
}
ix += 2;
i++;
min = i;
minf = CABS1(x,ix);
}
ix += 2;
i++;
}
return (min + 1);

} else {
inc_x2 = 2 * inc_x;
inc_x2 = 2 * inc_x;

minf = CABS1(x,0);
ix += inc_x2;
i++;
minf = CABS1(x,0);
ix += inc_x2;
i++;

while(i < n)
while(i < n)
{
if( CABS1(x,ix) < minf )
{
if( CABS1(x,ix) < minf )
{
min = i;
minf = CABS1(x,ix);
}
ix += inc_x2;
i++;
min = i;
minf = CABS1(x,ix);
}
ix += inc_x2;
i++;
}
return (min + 1);
}
}



+ 210
- 0
kernel/zarch/samax.c View File

@@ -0,0 +1,210 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * Vector kernel: returns the largest absolute value among the first n
 * single-precision elements of x.
 *
 * The loop count is n >> 6 and every pass consumes 256 bytes (64 floats),
 * so n is expected to be a positive multiple of 64; the caller in this file
 * passes n & -64 and only calls when that is > 0. brctg decrements the
 * counter before testing it, so a zero count would not end the loop after
 * one pass.
 *
 * The maximum is built from "compare high" + "select", i.e. a strict
 * greater-than test on each lane.
 */
static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT amax;

__asm__ volatile (
/* Seed the per-lane running maximum v0 with |x[0..3]|. */
"vl %%v0,0(%2) \n\t"
"vflpsb %%v0,%%v0 \n\t"
/* r0 = n / 64 (iteration count), r1 = 0 (byte offset into x). */
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* Prefetch for reading, 1024 bytes ahead of the current offset. */
"pfd 1, 1024(%%r1,%2) \n\t"

/* First half of the pass: load 32 floats and take absolute values. */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
/* Pairwise max: mask = (a > b), then select a where set, b elsewhere. */
"vfchsb %%v24,%%v16,%%v17 \n\t"
"vfchsb %%v25,%%v18,%%v19 \n\t"
"vfchsb %%v26,%%v20,%%v21 \n\t"
"vfchsb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

/* Reduce 4 candidate vectors to 2 ... */
"vfchsb %%v28,%%v24,%%v25 \n\t"
"vfchsb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

/* ... then 2 to 1 ... */
"vfchsb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

/* ... and merge into the running maximum v0. */
"vfchsb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* Second half of the pass: same tree on the next 32 floats (+128 bytes). */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfchsb %%v24,%%v16,%%v17 \n\t"
"vfchsb %%v25,%%v18,%%v19 \n\t"
"vfchsb %%v26,%%v20,%%v21 \n\t"
"vfchsb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchsb %%v28,%%v24,%%v25 \n\t"
"vfchsb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchsb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

"vfchsb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* Advance 256 bytes; loop while the decremented counter is non-zero. */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

/*
 * Horizontal reduction of the four lanes of v0. Shifting each doubleword
 * left by 32 bits moves lane 1 onto lane 0 and lane 3 onto lane 2, so the
 * compare/select leaves max(lane0,lane1) in lane 0 and max(lane2,lane3)
 * in lane 2.
 */
"veslg %%v16,%%v0,32 \n\t"
"vfchsb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"

/* Broadcast lane 2 and fold it into lane 0 with a single-element compare. */
"vrepf %%v16,%%v0,2 \n\t"
"wfchsb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
/* Lane 0 of v0 is the short float held in f0; copy it to the output. */
"ler %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return amax;
}
/*
 * Largest absolute value found in a vector of n elements read from x with
 * stride inc_x. Returns 0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the leading n & -64 elements go through samax_kernel_64 and
 * the remainder is scanned one element at a time.
 * Other strides: element 0 seeds the result, then the rest is scanned four
 * elements per step with a scalar clean-up loop at the end.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG pos;    /* index into x */
    BLASLONG seen;   /* number of elements already examined */
    BLASLONG k;

    if (n <= 0 || inc_x <= 0) {
        return (best);
    }

    if (inc_x != 1) {
        BLASLONG quad_limit;

        best = ABS(x[0]);
        pos = inc_x;
        seen = 1;

        /* Groups of four elements, starting after the seed element. */
        quad_limit = (n - 1) & -4;
        for (; seen < quad_limit; seen += 4, pos += inc_x * 4) {
            for (k = 0; k < 4; k++) {
                if (ABS(x[pos + k * inc_x]) > best) {
                    best = ABS(x[pos + k * inc_x]);
                }
            }
        }

        /* Remaining elements, one at a time. */
        for (; seen < n; seen++, pos += inc_x) {
            if (ABS(x[pos]) > best) {
                best = ABS(x[pos]);
            }
        }
        return (best);
    }

    /* Contiguous data. */
    {
        BLASLONG vec_len = n & -64;

        if (vec_len > 0) {
            best = samax_kernel_64(vec_len, x);
            pos = vec_len;
        } else {
            best = ABS(x[0]);
            pos = 1;
        }

        for (; pos < n; pos++) {
            if (ABS(x[pos]) > best) {
                best = ABS(x[pos]);
            }
        }
    }
    return (best);
}

+ 210
- 0
kernel/zarch/samin.c View File

@@ -0,0 +1,210 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * Vector kernel: returns the smallest absolute value among the first n
 * single-precision elements of x.
 *
 * The loop count is n >> 6 and every pass consumes 256 bytes (64 floats),
 * so n is expected to be a positive multiple of 64; the caller in this file
 * passes n & -64 and only calls when that is > 0. brctg decrements the
 * counter before testing it, so a zero count would not end the loop after
 * one pass.
 *
 * The minimum is built from "compare high" with swapped operands followed
 * by "select": mask = (b > a) picks a, otherwise b.
 */
static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT amin;

__asm__ volatile (
/* Seed the per-lane running minimum v0 with |x[0..3]|. */
"vl %%v0,0(%2) \n\t"
"vflpsb %%v0,%%v0 \n\t"
/* r0 = n / 64 (iteration count), r1 = 0 (byte offset into x). */
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* Prefetch for reading, 1024 bytes ahead of the current offset. */
"pfd 1, 1024(%%r1,%2) \n\t"

/* First half of the pass: load 32 floats and take absolute values. */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
/* Pairwise min: mask = (b > a), then select a where set, b elsewhere. */
"vfchsb %%v24,%%v17,%%v16 \n\t"
"vfchsb %%v25,%%v19,%%v18 \n\t"
"vfchsb %%v26,%%v21,%%v20 \n\t"
"vfchsb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

/* Reduce 4 candidate vectors to 2 ... */
"vfchsb %%v28,%%v25,%%v24 \n\t"
"vfchsb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

/* ... then 2 to 1 ... */
"vfchsb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

/* ... and merge into the running minimum v0. */
"vfchsb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* Second half of the pass: same tree on the next 32 floats (+128 bytes). */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"
"vfchsb %%v24,%%v17,%%v16 \n\t"
"vfchsb %%v25,%%v19,%%v18 \n\t"
"vfchsb %%v26,%%v21,%%v20 \n\t"
"vfchsb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchsb %%v28,%%v25,%%v24 \n\t"
"vfchsb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchsb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

"vfchsb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* Advance 256 bytes; loop while the decremented counter is non-zero. */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

/*
 * Horizontal reduction of the four lanes of v0. Shifting each doubleword
 * left by 32 bits moves lane 1 onto lane 0 and lane 3 onto lane 2, so the
 * compare/select leaves min(lane0,lane1) in lane 0 and min(lane2,lane3)
 * in lane 2. Lanes 1 and 3 are not used afterwards.
 */
"veslg %%v16,%%v0,32 \n\t"
"vfchsb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"

/* Broadcast lane 2 and fold it into lane 0 with a single-element compare. */
"vrepf %%v16,%%v0,2 \n\t"
"wfchsb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
/* Lane 0 of v0 is the short float held in f0; copy it to the output. */
"ler %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return amin;
}
/*
 * Smallest absolute value found in a vector of n elements read from x with
 * stride inc_x. Returns 0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the leading n & -64 elements go through samin_kernel_64 and
 * the remainder is scanned one element at a time.
 * Other strides: element 0 seeds the result, then the rest is scanned four
 * elements per step with a scalar clean-up loop at the end.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT smallest = 0.0;
    BLASLONG pos;    /* index into x */
    BLASLONG seen;   /* number of elements already examined */
    BLASLONG k;

    if (n <= 0 || inc_x <= 0) {
        return (smallest);
    }

    if (inc_x != 1) {
        BLASLONG quad_limit;

        smallest = ABS(x[0]);
        pos = inc_x;
        seen = 1;

        /* Groups of four elements, starting after the seed element. */
        quad_limit = (n - 1) & -4;
        for (; seen < quad_limit; seen += 4, pos += inc_x * 4) {
            for (k = 0; k < 4; k++) {
                if (ABS(x[pos + k * inc_x]) < smallest) {
                    smallest = ABS(x[pos + k * inc_x]);
                }
            }
        }

        /* Remaining elements, one at a time. */
        for (; seen < n; seen++, pos += inc_x) {
            if (ABS(x[pos]) < smallest) {
                smallest = ABS(x[pos]);
            }
        }
        return (smallest);
    }

    /* Contiguous data. */
    {
        BLASLONG vec_len = n & -64;

        if (vec_len > 0) {
            smallest = samin_kernel_64(vec_len, x);
            pos = vec_len;
        } else {
            smallest = ABS(x[0]);
            pos = 1;
        }

        for (; pos < n; pos++) {
            if (ABS(x[pos]) < smallest) {
                smallest = ABS(x[pos]);
            }
        }
    }
    return (smallest);
}

+ 174
- 0
kernel/zarch/sasum.c View File

@@ -0,0 +1,174 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * Vector kernel: returns the sum of the absolute values of the first n
 * single-precision elements of x.
 *
 * The loop count is n >> 6 and every pass consumes 256 bytes (64 floats),
 * so n is expected to be a positive multiple of 64; the caller in this file
 * passes n & -64 and only calls when that is > 0.
 *
 * Four vector accumulators (v0..v3) are used, so the order of the additions
 * differs from a plain left-to-right scalar sum.
 */
static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT asum;

__asm__ (
/* Clear the four partial-sum accumulators. */
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
/* r0 = n / 64 (iteration count), r1 = 0 (byte offset into x). */
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* Prefetch for reading, 1024 bytes ahead of the current offset. */
"pfd 1, 1024(%%r1,%2) \n\t"
/* First half of the pass: 32 floats. */
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"

/* Absolute values. */
"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"

/* Spread the eight vectors over the four accumulators. */
"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"

/* Second half of the pass: next 32 floats (+128 bytes). */
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"

"vflpsb %%v16, %%v16 \n\t"
"vflpsb %%v17, %%v17 \n\t"
"vflpsb %%v18, %%v18 \n\t"
"vflpsb %%v19, %%v19 \n\t"
"vflpsb %%v20, %%v20 \n\t"
"vflpsb %%v21, %%v21 \n\t"
"vflpsb %%v22, %%v22 \n\t"
"vflpsb %%v23, %%v23 \n\t"

"vfasb %%v0,%%v0,%%v16 \n\t"
"vfasb %%v1,%%v1,%%v17 \n\t"
"vfasb %%v2,%%v2,%%v18 \n\t"
"vfasb %%v3,%%v3,%%v19 \n\t"
"vfasb %%v0,%%v0,%%v20 \n\t"
"vfasb %%v1,%%v1,%%v21 \n\t"
"vfasb %%v2,%%v2,%%v22 \n\t"
"vfasb %%v3,%%v3,%%v23 \n\t"
/* Advance 256 bytes; loop while the decremented counter is non-zero. */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
/* Combine the four accumulators into v0. */
"vfasb %%v0,%%v0,%%v1 \n\t"
"vfasb %%v0,%%v0,%%v2 \n\t"
"vfasb %%v0,%%v0,%%v3 \n\t"
/*
 * Horizontal sum: the 32-bit doubleword shift puts lane 1 on lane 0 and
 * lane 3 on lane 2, so after the add lane 0 = l0+l1 and lane 2 = l2+l3.
 */
"veslg %%v1,%%v0,32 \n\t"
"vfasb %%v0,%%v0,%%v1 \n\t"
/* Broadcast lane 2 into v1 and add it to lane 0 as scalar short floats. */
"vrepf %%v1,%%v0,2 \n\t"
"aebr %%f0,%%f1 \n\t"
"ler %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);

return asum;
}

/*
 * Sum of absolute values of n elements read from x with stride inc_x.
 * Returns 0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the leading n & -64 elements are summed by sasum_kernel_64
 * and the rest is added one element at a time.
 * Other strides: four elements per step are accumulated alternately into
 * two partial sums, which are added together before the scalar clean-up.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT total = 0.0;
    BLASLONG off = 0;     /* index into x */
    BLASLONG done = 0;    /* elements consumed on the strided path */
    BLASLONG quad_end;
    FLOAT acc_a;
    FLOAT acc_b;

    if (n <= 0 || inc_x <= 0) {
        return total;
    }

    if (inc_x == 1) {
        BLASLONG vec_len = n & -64;

        if (vec_len > 0) {
            total = sasum_kernel_64(vec_len, x);
            off = vec_len;
        }

        for (; off < n; off++) {
            total += ABS(x[off]);
        }
        return total;
    }

    /* Strided path: two interleaved partial sums over groups of four. */
    quad_end = n & -4;
    acc_a = 0.0;
    acc_b = 0.0;
    for (; done < quad_end; done += 4, off += inc_x * 4) {
        acc_a += ABS(x[off]);
        acc_b += ABS(x[off + inc_x]);
        acc_a += ABS(x[off + 2 * inc_x]);
        acc_b += ABS(x[off + 3 * inc_x]);
    }
    total = acc_a + acc_b;

    for (; done < n; done++, off += inc_x) {
        total += ABS(x[off]);
    }
    return total;
}



+ 184
- 0
kernel/zarch/saxpy.c View File

@@ -0,0 +1,184 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Vector kernel: y[i] = (*alpha) * x[i] + y[i] for the first n
 * single-precision elements, using fused multiply-add.
 *
 * The loop count is n >> 6 and every pass handles 256 bytes (64 floats) of
 * each array, so n is expected to be a positive multiple of 64; the caller
 * in this file passes n & -64 and only calls when that is non-zero.
 */
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
/* v0 = *alpha replicated into all four lanes. */
"vlrepf %%v0,%3 \n\t"
/* r0 = n / 64 (iteration count), r1 = 0 (byte offset into x and y). */
"srlg %%r0,%0,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* Prefetch x for reading and y for writing, 1024 bytes ahead. */
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"

/* Floats 0..15: v16..v19 = x, v20..v23 = y, result left in v16..v19. */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20 \n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21 \n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22 \n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23 \n\t"

/* Floats 16..31: v24..v27 = x, v28..v31 = y. */
"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,80(%%r1,%1) \n\t"
"vl %%v26,96(%%r1,%1) \n\t"
"vl %%v27,112(%%r1,%1) \n\t"
"vl %%v28,64(%%r1,%2) \n\t"
"vl %%v29,80(%%r1,%2) \n\t"
"vl %%v30,96(%%r1,%2) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"

/* Results go to v20..v23, whose y values were already consumed above. */
"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t"

/* Write back the first 128 bytes of y. */
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"

/* Same sequence for floats 32..63 (+128 bytes). */
"vl %%v16,128(%%r1,%1) \n\t"
"vl %%v17,144(%%r1,%1) \n\t"
"vl %%v18,160(%%r1,%1) \n\t"
"vl %%v19,176(%%r1,%1) \n\t"
"vl %%v20,128(%%r1,%2) \n\t"
"vl %%v21,144(%%r1,%2) \n\t"
"vl %%v22,160(%%r1,%2) \n\t"
"vl %%v23,176(%%r1,%2) \n\t"
"vfmasb %%v16,%%v0,%%v16,%%v20 \n\t"
"vfmasb %%v17,%%v0,%%v17,%%v21 \n\t"
"vfmasb %%v18,%%v0,%%v18,%%v22 \n\t"
"vfmasb %%v19,%%v0,%%v19,%%v23 \n\t"

"vl %%v24,192(%%r1,%1) \n\t"
"vl %%v25,208(%%r1,%1) \n\t"
"vl %%v26,224(%%r1,%1) \n\t"
"vl %%v27,240(%%r1,%1) \n\t"
"vl %%v28,192(%%r1,%2) \n\t"
"vl %%v29,208(%%r1,%2) \n\t"
"vl %%v30,224(%%r1,%2) \n\t"
"vl %%v31,240(%%r1,%2) \n\t"

"vfmasb %%v20,%%v0,%%v24,%%v28 \n\t"
"vfmasb %%v21,%%v0,%%v25,%%v29 \n\t"
"vfmasb %%v22,%%v0,%%v26,%%v30 \n\t"
"vfmasb %%v23,%%v0,%%v27,%%v31 \n\t"
"vst %%v16,128(%%r1,%2) \n\t"
"vst %%v17,144(%%r1,%2) \n\t"
"vst %%v18,160(%%r1,%2) \n\t"
"vst %%v19,176(%%r1,%2) \n\t"
"vst %%v20,192(%%r1,%2) \n\t"
"vst %%v21,208(%%r1,%2) \n\t"
"vst %%v22,224(%%r1,%2) \n\t"
"vst %%v23,240(%%r1,%2) \n\t"

/* Advance 256 bytes; loop while the decremented counter is non-zero. */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

/*
 * y := da * x + y over n elements, with strides inc_x and inc_y.
 * Always returns 0; nothing is done when n <= 0.
 *
 * dummy0, dummy1, dummy and dummy2 are not read.
 *
 * When both strides are 1 the leading n & -64 elements are handled by
 * saxpy_kernel_64 and the rest by a scalar loop. Otherwise elements are
 * processed four at a time (all four products are formed before any store)
 * followed by a scalar clean-up loop.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG xpos = 0;
    BLASLONG ypos = 0;
    BLASLONG quad_end;

    if (n <= 0) {
        return 0;
    }

    if (inc_x == 1 && inc_y == 1) {
        BLASLONG vec_len = n & -64;

        if (vec_len != 0) {
            saxpy_kernel_64(vec_len, x, y, &da);
        }

        /* Elements past the last full block of 64. */
        for (k = vec_len; k < n; k++) {
            y[k] += da * x[k];
        }
        return 0;
    }

    quad_end = n & -4;
    for (k = 0; k < quad_end; k += 4) {
        /* Form the four scaled values first, then update y. */
        FLOAT s0 = da * x[xpos];
        FLOAT s1 = da * x[xpos + inc_x];
        FLOAT s2 = da * x[xpos + 2 * inc_x];
        FLOAT s3 = da * x[xpos + 3 * inc_x];

        y[ypos] += s0;
        y[ypos + inc_y] += s1;
        y[ypos + 2 * inc_y] += s2;
        y[ypos + 3 * inc_y] += s3;

        xpos += inc_x * 4;
        ypos += inc_y * 4;
    }

    for (; k < n; k++) {
        y[ypos] += da * x[xpos];
        xpos += inc_x;
        ypos += inc_y;
    }
    return 0;
}



+ 85
- 0
kernel/zarch/scopy.c View File

@@ -0,0 +1,85 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Block-copy kernel: copies the first n single-precision elements of x to y
 * in 256-byte chunks using MVC.
 *
 * The loop count is n >> 6 and every pass moves 256 bytes (64 floats), so n
 * is expected to be a positive multiple of 64; the caller in this file
 * passes n & -64 and only calls when that is > 0.
 */
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* Work on private copies of the pointers: r1 = source, r2 = destination. */
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
/* r0 = n / 64 (iteration count). */
"srlg %%r0,%0,6 \n\t"
"0: \n\t"
/* Prefetch source for reading and destination for writing. */
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
/* Move 256 bytes from (r1) to (r2). */
"mvc 0(256,%%r2),0(%%r1) \n\t"
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
/* Loop while the decremented counter is non-zero. */
"brctg %%r0,0b "
:
:"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","r2"
);
}

/*
 * Copies n elements from x (stride inc_x) to y (stride inc_y).
 * Always returns 0; nothing is copied when n <= 0.
 *
 * With both strides equal to 1 the leading n & -64 elements are copied by
 * scopy_kernel_64 and the remainder element by element; any other stride
 * combination uses a plain element-wise loop.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    BLASLONG k;
    BLASLONG src = 0;
    BLASLONG dst = 0;

    if (n <= 0) {
        return 0;
    }

    if (inc_x != 1 || inc_y != 1) {
        /* Generic strided copy. */
        for (k = 0; k < n; k++) {
            y[dst] = x[src];
            src += inc_x;
            dst += inc_y;
        }
        return 0;
    }

    /* Contiguous copy: bulk part first, then the leftover elements. */
    k = 0;
    {
        BLASLONG vec_len = n & -64;

        if (vec_len > 0) {
            scopy_kernel_64(vec_len, x, y);
            k = vec_len;
        }
    }

    for (; k < n; k++) {
        y[k] = x[k];
    }
    return 0;
}

+ 140
- 0
kernel/zarch/sdot.c View File

@@ -0,0 +1,140 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Vector kernel: returns the dot product of the first n single-precision
 * elements of x and y, accumulated with fused multiply-add.
 *
 * The loop count is n >> 5 and every pass consumes 128 bytes (32 floats) of
 * each array, so n is expected to be a positive multiple of 32; the caller
 * in this file passes n & -32 and only calls when that is non-zero.
 *
 * A single four-lane accumulator (v0) is used, so the order of the
 * additions differs from a plain left-to-right scalar sum.
 */
static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
{
FLOAT dot;

__asm__ volatile (
/* v0 = four-lane accumulator, r0 = n / 32, r1 = 0 (byte offset). */
"vzero %%v0 \n\t"
"srlg %%r0,%1,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* Prefetch both inputs 1024 bytes ahead. */
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"

/* Load 32 floats of x into v16..v23. */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"

/* Interleave y loads with v0 += x * y. */
"vl %%v24,0(%%r1,%3) \n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,16(%%r1,%3) \n\t"
"vfmasb %%v0,%%v17,%%v25,%%v0 \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vfmasb %%v0,%%v18,%%v26,%%v0 \n\t"
"vl %%v27,48(%%r1,%3) \n\t"
"vfmasb %%v0,%%v19,%%v27,%%v0 \n\t"
"vl %%v28,64(%%r1,%3) \n\t"
"vfmasb %%v0,%%v20,%%v28,%%v0 \n\t"
"vl %%v29,80(%%r1,%3) \n\t"
"vfmasb %%v0,%%v21,%%v29,%%v0 \n\t"
"vl %%v30,96(%%r1,%3) \n\t"
"vfmasb %%v0,%%v22,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%3) \n\t"
"vfmasb %%v0,%%v23,%%v31,%%v0 \n\t"
/* Advance 128 bytes; loop while the decremented counter is non-zero. */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
/* Horizontal sum: broadcast lanes 1..3 and add them to lane 0 as scalars. */
"vrepf %%v1,%%v0,1 \n\t"
"vrepf %%v2,%%v0,2 \n\t"
"vrepf %%v3,%%v0,3 \n\t"
"aebr %%f0,%%f1 \n\t"
"aebr %%f0,%%f2 \n\t"
"aebr %%f0,%%f3 \n\t"
"ler %0,%%f0 "
:"=f"(dot)
:"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return dot;
}

/*
 * Dot product of n elements of x (stride inc_x) and y (stride inc_y).
 * Returns 0 when n <= 0.
 *
 * With both strides equal to 1 the leading n & -32 elements are handled by
 * sdot_kernel_32 and the remainder is accumulated one product at a time.
 * Otherwise products are accumulated two per step, followed by a scalar
 * clean-up loop.
 */
FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y)
{
    FLOAT acc = 0.0;
    BLASLONG k = 0;
    BLASLONG xpos = 0;
    BLASLONG ypos = 0;
    BLASLONG pair_end;

    if (n <= 0) {
        return (acc);
    }

    if (inc_x == 1 && inc_y == 1) {
        BLASLONG vec_len = n & -32;

        if (vec_len != 0) {
            acc = sdot_kernel_32(vec_len, x, y);
        }

        /* Elements past the last full block of 32. */
        for (k = vec_len; k < n; k++) {
            acc += y[k] * x[k];
        }
        return (acc);
    }

    /* Strided path: two products per step. */
    pair_end = n & -2;
    for (; k < pair_end; k += 2) {
        acc += y[ypos] * x[xpos] + y[ypos + inc_y] * x[xpos + inc_x];
        xpos += inc_x * 2;
        ypos += inc_y * 2;
    }

    for (; k < n; k++) {
        acc += y[ypos] * x[xpos];
        xpos += inc_x;
        ypos += inc_y;
    }
    return (acc);
}



+ 668
- 0
kernel/zarch/sgemv_n_4.c View File

@@ -0,0 +1,668 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#define NBMAX 2048

/*
 * Vector kernel for four columns:
 *   y[i] += (*alpha) * (ap[0][i]*xo[0] + ap[1][i]*xo[1]
 *                       + ap[2][i]*xo[2] + ap[3][i]*xo[3])
 * with the scaling by *alpha folded into the four x values up front.
 *
 * Asm operands: %0 = n, %1..%4 = ap[0]..ap[3], %5 = xo, %6 = y, %7 = *alpha.
 *
 * The main loop runs (n & -32) / 32 times over 32 floats (128 bytes) each;
 * a second loop then runs (n & 28) / 4 times over 4 floats each. Elements
 * beyond the last multiple of 4 (n & 3) are not touched here.
 */
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
/* v0..v3 = xo[0..3], each replicated across the lanes; v4 = *alpha. */
"vlrepf %%v0,0(%5) \n\t"
"vlrepf %%v1,4(%5) \n\t"
"vlrepf %%v2,8(%5) \n\t"
"vlrepf %%v3,12(%5) \n\t"
"vlrepf %%v4,%7 \n\t"
/* Pre-scale the x values by alpha. */
"vfmsb %%v0,%%v0,%%v4 \n\t"
"vfmsb %%v1,%%v1,%%v4 \n\t"
"vfmsb %%v2,%%v2,%%v4 \n\t"
"vfmsb %%v3,%%v3,%%v4 \n\t"
/* r1 = 0 (byte offset shared by all columns and y). */
"xgr %%r1,%%r1 \n\t"

/* r0 = n & -32; skip the main loop when there is no full block of 32. */
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"

"srlg %%r0,%%r0,5 \n\t"
"0: \n\t"
/* Prefetch the four columns for reading and y for writing. */
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"
"pfd 1,1024(%%r1,%4) \n\t"
"pfd 2,1024(%%r1,%6) \n\t"

/* Rows 0..15: four vectors from each of the four columns. */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"vl %%v19,0(%%r1,%4) \n\t"
"vl %%v20,16(%%r1,%1) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,16(%%r1,%3) \n\t"
"vl %%v23,16(%%r1,%4) \n\t"
"vl %%v24,32(%%r1,%1) \n\t"
"vl %%v25,32(%%r1,%2) \n\t"
"vl %%v26,32(%%r1,%3) \n\t"
"vl %%v27,32(%%r1,%4) \n\t"
"vl %%v28,48(%%r1,%1) \n\t"
"vl %%v29,48(%%r1,%2) \n\t"
"vl %%v30,48(%%r1,%3) \n\t"
"vl %%v31,48(%%r1,%4) \n\t"

/* For each y vector: load, accumulate the four column terms, store. */
"vl %%v4,0(%%r1,%6) \n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,0(%%r1,%6) \n\t"

"vl %%v4,16(%%r1,%6) \n\t"
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,16(%%r1,%6) \n\t"

"vl %%v4,32(%%r1,%6) \n\t"
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,32(%%r1,%6) \n\t"

"vl %%v4,48(%%r1,%6) \n\t"
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,48(%%r1,%6) \n\t"

/* Rows 16..31: same pattern at +64 bytes. */
"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,64(%%r1,%2) \n\t"
"vl %%v18,64(%%r1,%3) \n\t"
"vl %%v19,64(%%r1,%4) \n\t"
"vl %%v20,80(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,80(%%r1,%3) \n\t"
"vl %%v23,80(%%r1,%4) \n\t"
"vl %%v24,96(%%r1,%1) \n\t"
"vl %%v25,96(%%r1,%2) \n\t"
"vl %%v26,96(%%r1,%3) \n\t"
"vl %%v27,96(%%r1,%4) \n\t"
"vl %%v28,112(%%r1,%1) \n\t"
"vl %%v29,112(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%3) \n\t"
"vl %%v31,112(%%r1,%4) \n\t"

"vl %%v4,64(%%r1,%6) \n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,64(%%r1,%6) \n\t"

"vl %%v4,80(%%r1,%6) \n\t"
"vfmasb %%v4,%%v20,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v21,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v22,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v23,%%v3,%%v4 \n\t"
"vst %%v4,80(%%r1,%6) \n\t"

"vl %%v4,96(%%r1,%6) \n\t"
"vfmasb %%v4,%%v24,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v25,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v26,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v27,%%v3,%%v4 \n\t"
"vst %%v4,96(%%r1,%6) \n\t"

"vl %%v4,112(%%r1,%6) \n\t"
"vfmasb %%v4,%%v28,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v29,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v30,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v31,%%v3,%%v4 \n\t"
"vst %%v4,112(%%r1,%6) \n\t"
/* Advance 128 bytes; loop while the decremented counter is non-zero. */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"

"1: \n\t"
/* Remainder: r0 = n & 28, i.e. the leftover rows in whole groups of 4. */
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"

"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
/* One vector (4 rows) per pass, continuing from the current offset r1. */
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,0(%%r1,%3) \n\t"
"vl %%v19,0(%%r1,%4) \n\t"

"vl %%v4,0(%%r1,%6) \n\t"
"vfmasb %%v4,%%v16,%%v0,%%v4 \n\t"
"vfmasb %%v4,%%v17,%%v1,%%v4 \n\t"
"vfmasb %%v4,%%v18,%%v2,%%v4 \n\t"
"vfmasb %%v4,%%v19,%%v3,%%v4 \n\t"
"vst %%v4,0(%%r1,%6) \n\t"

"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"

"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

/*
 * Two-column kernel of the non-transposed SGEMV (z/Architecture vector asm).
 *
 * Computes, for the first (n & ~3) elements:
 *     y[i] += ap[0][i] * (alpha * xo[0]) + ap[1][i] * (alpha * xo[1])
 *
 * n      number of rows; the two low bits are ignored (only (n & -32) is
 *        handled by the unrolled loop and (n & 28) by the tail loop)
 * ap     ap[0] and ap[1] point at the two matrix columns
 * xo     two consecutive x values
 * y      accumulator vector, updated in place
 * alpha  pointer to the scalar multiplier
 *
 * Register use: r0 = loop counter, r1 = byte offset into the columns and y,
 * v0/v1 = replicated alpha*x values, v2 = scratch / y accumulator.
 */
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
/* v0 = alpha * xo[0], v1 = alpha * xo[1], replicated into every lane */
"vlrepf %%v0,0(%3) \n\t"
"vlrepf %%v1,4(%3) \n\t"
"vlrepf %%v2,%5 \n\t"
"vfmsb %%v0,%%v0,%%v2 \n\t"
"vfmsb %%v1,%%v1,%%v2 \n\t"
"xgr %%r1,%%r1 \n\t"

/* r0 = n & -32; skip the unrolled loop when fewer than 32 rows */
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"

/* unrolled loop: 32 floats (128 bytes) per iteration */
"srlg %%r0,%%r0,5 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%4) \n\t"

"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"
"vl %%v18,16(%%r1,%1) \n\t"
"vl %%v19,16(%%r1,%2) \n\t"
"vl %%v20,32(%%r1,%1) \n\t"
"vl %%v21,32(%%r1,%2) \n\t"
"vl %%v22,48(%%r1,%1) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vl %%v24,64(%%r1,%1) \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vl %%v26,80(%%r1,%1) \n\t"
"vl %%v27,80(%%r1,%2) \n\t"
"vl %%v28,96(%%r1,%1) \n\t"
"vl %%v29,96(%%r1,%2) \n\t"
"vl %%v30,112(%%r1,%1) \n\t"
"vl %%v31,112(%%r1,%2) \n\t"

/* each group: load 4 y values, add both column contributions, store back */
"vl %%v2,0(%%r1,%4) \n\t"
"vfmasb %%v2,%%v16,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v17,%%v1,%%v2 \n\t"
"vst %%v2,0(%%r1,%4) \n\t"

"vl %%v2,16(%%r1,%4) \n\t"
"vfmasb %%v2,%%v18,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v19,%%v1,%%v2 \n\t"
"vst %%v2,16(%%r1,%4) \n\t"

"vl %%v2,32(%%r1,%4) \n\t"
"vfmasb %%v2,%%v20,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v21,%%v1,%%v2 \n\t"
"vst %%v2,32(%%r1,%4) \n\t"

"vl %%v2,48(%%r1,%4) \n\t"
"vfmasb %%v2,%%v22,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v23,%%v1,%%v2 \n\t"
"vst %%v2,48(%%r1,%4) \n\t"

"vl %%v2,64(%%r1,%4) \n\t"
"vfmasb %%v2,%%v24,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v25,%%v1,%%v2 \n\t"
"vst %%v2,64(%%r1,%4) \n\t"

"vl %%v2,80(%%r1,%4) \n\t"
"vfmasb %%v2,%%v26,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v27,%%v1,%%v2 \n\t"
"vst %%v2,80(%%r1,%4) \n\t"

"vl %%v2,96(%%r1,%4) \n\t"
"vfmasb %%v2,%%v28,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v29,%%v1,%%v2 \n\t"
"vst %%v2,96(%%r1,%4) \n\t"

"vl %%v2,112(%%r1,%4) \n\t"
"vfmasb %%v2,%%v30,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v31,%%v1,%%v2 \n\t"
"vst %%v2,112(%%r1,%4) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"

/* tail: r0 = n & 28, i.e. the remaining whole groups of 4 floats */
"1: \n\t"
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"

"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,0(%%r1,%2) \n\t"

"vl %%v2,0(%%r1,%4) \n\t"
"vfmasb %%v2,%%v16,%%v0,%%v2 \n\t"
"vfmasb %%v2,%%v17,%%v1,%%v2 \n\t"
"vst %%v2,0(%%r1,%4) \n\t"

"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"

"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

/*
 * Single-column kernel of the non-transposed SGEMV (z/Architecture vector asm).
 *
 * Computes, for the first (n & ~3) elements:
 *     y[i] += a0[i] * (alpha * xo[0])
 *
 * n      number of rows; the two low bits are ignored (only (n & -32) is
 *        handled by the unrolled loop and (n & 28) by the tail loop)
 * a0     the matrix column
 * xo     pointer to the single x value used
 * y      accumulator vector, updated in place
 * alpha  pointer to the scalar multiplier
 *
 * Register use: r0 = loop counter, r1 = byte offset into a0 and y,
 * v0 = replicated alpha*x, v1 = scratch / y accumulator.
 */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile (
/* v0 = alpha * xo[0], replicated into every lane */
"vlrepf %%v0,0(%2) \n\t"
"vlrepf %%v1,%4 \n\t"
"vfmsb %%v0,%%v0,%%v1 \n\t"
"xgr %%r1,%%r1 \n\t"

/* r0 = n & -32; skip the unrolled loop when fewer than 32 rows */
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"

/* unrolled loop: 32 floats (128 bytes) per iteration */
"srlg %%r0,%%r0,5 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"

"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%1) \n\t"
"vl %%v21,80(%%r1,%1) \n\t"
"vl %%v22,96(%%r1,%1) \n\t"
"vl %%v23,112(%%r1,%1) \n\t"

/* each group: load 4 y values, fused multiply-add the column, store back */
"vl %%v1,0(%%r1,%3) \n\t"
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"

"vl %%v1,16(%%r1,%3) \n\t"
"vfmasb %%v1,%%v17,%%v0,%%v1 \n\t"
"vst %%v1,16(%%r1,%3) \n\t"

"vl %%v1,32(%%r1,%3) \n\t"
"vfmasb %%v1,%%v18,%%v0,%%v1 \n\t"
"vst %%v1,32(%%r1,%3) \n\t"

"vl %%v1,48(%%r1,%3) \n\t"
"vfmasb %%v1,%%v19,%%v0,%%v1 \n\t"
"vst %%v1,48(%%r1,%3) \n\t"

"vl %%v1,64(%%r1,%3) \n\t"
"vfmasb %%v1,%%v20,%%v0,%%v1 \n\t"
"vst %%v1,64(%%r1,%3) \n\t"

"vl %%v1,80(%%r1,%3) \n\t"
"vfmasb %%v1,%%v21,%%v0,%%v1 \n\t"
"vst %%v1,80(%%r1,%3) \n\t"

"vl %%v1,96(%%r1,%3) \n\t"
"vfmasb %%v1,%%v22,%%v0,%%v1 \n\t"
"vst %%v1,96(%%r1,%3) \n\t"

"vl %%v1,112(%%r1,%3) \n\t"
"vfmasb %%v1,%%v23,%%v0,%%v1 \n\t"
"vst %%v1,112(%%r1,%3) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"

/* tail: r0 = n & 28, i.e. the remaining whole groups of 4 floats */
"1: \n\t"
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"

"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%1) \n\t"

"vl %%v1,0(%%r1,%3) \n\t"
"vfmasb %%v1,%%v16,%%v0,%%v1 \n\t"
"vst %%v1,0(%%r1,%3) \n\t"

"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"

"3: \n\t"
"nop "
:
:"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

/*
 * Scatter-add a contiguous vector into a strided one:
 *     dest[k * inc_dest] += src[k]   for k = 0 .. n-1
 * Does nothing when n <= 0.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    FLOAT *in = src;
    FLOAT *out = dest;
    BLASLONG remaining = n;

    while (remaining > 0) {
        *out = *out + *in;
        in++;
        out += inc_dest;
        remaining--;
    }
}

/*
 * Single-precision GEMV, non-transposed:  y += alpha * A * x
 *
 * A is m x n with element (i, j) at a[i + j * lda]; x has n elements with
 * stride inc_x; y has m elements with stride inc_y.
 *
 * m, n     matrix dimensions; returns immediately when either is < 1
 * dummy1   unused
 * alpha    scalar multiplier
 * a, lda   matrix and the distance (in elements) between consecutive columns
 * x, inc_x input vector and its stride
 * y, inc_y vector updated in place and its stride
 * buffer   scratch space; used as a contiguous accumulator of up to NBMAX
 *          floats when inc_y != 1
 *
 * Rows are processed in blocks of NB <= NBMAX (always a multiple of 4) by the
 * vector kernels, four / two / one column at a time.  The last (m & 3) rows
 * are handled by scalar code at the end.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    FLOAT *ap[4];
    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;
    BLASLONG lda4 = lda << 2;
    FLOAT xbuffer[8], *ybuffer;

    if (m < 1) return (0);
    if (n < 1) return (0);

    ybuffer = buffer;
    n1 = n >> 2;   /* number of full groups of four columns */
    n2 = n & 3;    /* leftover columns */

    m3 = m & 3;                      /* rows left for the scalar tail */
    m1 = m & -4;                     /* rows handled by the vector kernels */
    m2 = (m & (NBMAX - 1)) - m3;     /* size of the final, partial row block */

    y_ptr = y;

    BLASLONG NB = NBMAX;

    /* One iteration per row block; the loop ends after the first block that
       is shorter than NBMAX (or immediately if no partial block exists). */
    while (NB == NBMAX)
    {
        m1 -= NB;
        if (m1 < 0)
        {
            if (m2 == 0) break;
            NB = m2;
        }
        a_ptr = a;
        x_ptr = x;
        ap[0] = a_ptr;
        ap[1] = a_ptr + lda;
        ap[2] = ap[1] + lda;
        ap[3] = ap[2] + lda;

        /* With a strided y the kernels accumulate into a zeroed contiguous
           scratch block that add_y() scatters afterwards.  Only NB elements
           of FLOAT are ever touched, so clear exactly that many bytes. */
        if (inc_y != 1)
            memset(ybuffer, 0, NB * sizeof(FLOAT));
        else
            ybuffer = y_ptr;

        if (inc_x == 1)
        {
            for (i = 0; i < n1; i++)
            {
                sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
                x_ptr += 4;
            }

            if (n2 & 2)
            {
                /* ap[0]/ap[1] already point at the next two columns */
                sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha);
                a_ptr += lda * 2;
                x_ptr += 2;
            }

            if (n2 & 1)
            {
                sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha);
                a_ptr += lda;
                x_ptr += 1;
            }
        }
        else
        {
            /* Strided x: gather the values the kernel needs into xbuffer. */
            for (i = 0; i < n1; i++)
            {
                xbuffer[0] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[1] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[2] = x_ptr[0];
                x_ptr += inc_x;
                xbuffer[3] = x_ptr[0];
                x_ptr += inc_x;
                sgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha);
                ap[0] += lda4;
                ap[1] += lda4;
                ap[2] += lda4;
                ap[3] += lda4;
                a_ptr += lda4;
            }

            for (i = 0; i < n2; i++)
            {
                xbuffer[0] = x_ptr[0];
                x_ptr += inc_x;
                sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha);
                a_ptr += lda;
            }
        }

        /* advance to the next block of rows */
        a += NB;
        if (inc_y != 1)
        {
            add_y(NB, ybuffer, y_ptr, inc_y);
            y_ptr += NB * inc_y;
        }
        else
            y_ptr += NB;
    }

    if (m3 == 0) return (0);

    /* Scalar tail: the last m3 (1..3) rows, one dot product per row. */
    if (m3 == 3)
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp0 = 0.0;
        FLOAT temp1 = 0.0;
        FLOAT temp2 = 0.0;
        if (lda == 3 && inc_x == 1)
        {
            /* columns are packed back to back: walk A linearly */
            for (i = 0; i < (n & -4); i += 4)
            {
                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
                temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];

                temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
                temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
                temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];

                a_ptr += 12;
                x_ptr += 4;
            }

            for (; i < n; i++)
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                temp2 += a_ptr[2] * x_ptr[0];
                a_ptr += 3;
                x_ptr++;
            }
        }
        else
        {
            for (i = 0; i < n; i++)
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                temp2 += a_ptr[2] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }
        y_ptr[0] += alpha * temp0;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp1;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp2;
        return (0);
    }

    if (m3 == 2)
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp0 = 0.0;
        FLOAT temp1 = 0.0;
        if (lda == 2 && inc_x == 1)
        {
            for (i = 0; i < (n & -4); i += 4)
            {
                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
                temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
                temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
                a_ptr += 8;
                x_ptr += 4;
            }

            for (; i < n; i++)
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                a_ptr += 2;
                x_ptr++;
            }
        }
        else
        {
            for (i = 0; i < n; i++)
            {
                temp0 += a_ptr[0] * x_ptr[0];
                temp1 += a_ptr[1] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }
        y_ptr[0] += alpha * temp0;
        y_ptr += inc_y;
        y_ptr[0] += alpha * temp1;
        return (0);
    }

    if (m3 == 1)
    {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp = 0.0;
        if (lda == 1 && inc_x == 1)
        {
            for (i = 0; i < (n & -4); i += 4)
            {
                temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
            }

            for (; i < n; i++)
            {
                temp += a_ptr[i] * x_ptr[i];
            }
        }
        else
        {
            for (i = 0; i < n; i++)
            {
                temp += a_ptr[0] * x_ptr[0];
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }
        y_ptr[0] += alpha * temp;
        return (0);
    }

    return (0);
}



+ 826
- 0
kernel/zarch/sgemv_t_4.c View File

@@ -0,0 +1,826 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

#define NBMAX 2048

/*
 * Four-column kernel of the transposed SGEMV (z/Architecture vector asm).
 *
 * Computes four dot products over the first (n & ~3) elements:
 *     y[k] = sum_i ap[k][i] * x[i]      for k = 0 .. 3
 *
 * n   vector length; the two low bits are ignored (only (n & -32) is handled
 *     by the unrolled loop and (n & 28) by the tail loop)
 * ap  ap[0..3] point at the four matrix columns
 * x   contiguous input vector
 * y   receives the four results (overwritten, not accumulated)
 *
 * Register use: r0 = loop counter, r1 = byte offset, v0..v3 = per-column
 * partial sums (4 lanes each), v4 = scratch for the horizontal reduction.
 */
static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        "vzero %%v0 \n\t"
        "vzero %%v1 \n\t"
        "vzero %%v2 \n\t"
        "vzero %%v3 \n\t"
        "xgr %%r1,%%r1 \n\t"

        /* r0 = n & -32; skip the unrolled loop when fewer than 32 elements */
        "lghi %%r0,-32 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 1f \n\t"

        /* unrolled loop: 32 floats (128 bytes) per iteration */
        "srlg %%r0,%%r0,5 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"
        "pfd 1,1024(%%r1,%3) \n\t"
        "pfd 1,1024(%%r1,%4) \n\t"
        "pfd 1,1024(%%r1,%5) \n\t"

        /* v16..v23 = 32 consecutive x values */
        "vl %%v16,0(%%r1,%5) \n\t"
        "vl %%v17,16(%%r1,%5) \n\t"
        "vl %%v18,32(%%r1,%5) \n\t"
        "vl %%v19,48(%%r1,%5) \n\t"
        "vl %%v20,64(%%r1,%5) \n\t"
        "vl %%v21,80(%%r1,%5) \n\t"
        "vl %%v22,96(%%r1,%5) \n\t"
        "vl %%v23,112(%%r1,%5) \n\t"

        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,0(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
        "vl %%v26,0(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t"
        "vl %%v27,0(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t"

        "vl %%v28,16(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t"
        "vl %%v29,16(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t"
        "vl %%v30,16(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t"
        "vl %%v31,16(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t"

        "vl %%v24,32(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t"
        "vl %%v25,32(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t"
        "vl %%v26,32(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t"
        "vl %%v27,32(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t"

        "vl %%v28,48(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t"
        "vl %%v29,48(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t"
        "vl %%v30,48(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t"
        "vl %%v31,48(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t"

        "vl %%v24,64(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t"
        "vl %%v25,64(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t"
        "vl %%v26,64(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t"
        "vl %%v27,64(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t"

        "vl %%v28,80(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t"
        "vl %%v29,80(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t"
        "vl %%v30,80(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t"
        "vl %%v31,80(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t"

        "vl %%v24,96(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t"
        "vl %%v25,96(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t"
        "vl %%v26,96(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t"
        "vl %%v27,96(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t"

        "vl %%v28,112(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t"
        "vl %%v29,112(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t"
        "vl %%v30,112(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t"
        "vl %%v31,112(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t"

        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"

        /* tail: r0 = n & 28, i.e. the remaining whole groups of 4 floats */
        "1: \n\t"
        "lghi %%r0,28 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 3f \n\t"

        "srlg %%r0,%%r0,2 \n\t"
        "2: \n\t"
        "vl %%v16,0(%%r1,%5) \n\t"

        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
        "vl %%v25,0(%%r1,%2) \n\t"
        "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"
        "vl %%v26,0(%%r1,%3) \n\t"
        "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t"
        "vl %%v27,0(%%r1,%4) \n\t"
        "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t"

        "agfi %%r1,16 \n\t"
        "brctg %%r0,2b \n\t"

        /* r0 is 0 on every path that reaches label 3, so no branch-on-count
           may be placed here: it would wrap the counter and jump back into
           the main loop.  Only the horizontal reduction follows: lanes 1..3
           of each accumulator are added onto lane 0 and the scalar stored. */
        "3: \n\t"
        "vrepf %%v4,%%v0,1 \n\t"
        "aebr %%f0,%%f4 \n\t"
        "vrepf %%v4,%%v0,2 \n\t"
        "aebr %%f0,%%f4 \n\t"
        "vrepf %%v4,%%v0,3 \n\t"
        "aebr %%f0,%%f4 \n\t"
        "ste %%f0,0(%6) \n\t"
        "vrepf %%v4,%%v1,1 \n\t"
        "aebr %%f1,%%f4 \n\t"
        "vrepf %%v4,%%v1,2 \n\t"
        "aebr %%f1,%%f4 \n\t"
        "vrepf %%v4,%%v1,3 \n\t"
        "aebr %%f1,%%f4 \n\t"
        "ste %%f1,4(%6) \n\t"
        "vrepf %%v4,%%v2,1 \n\t"
        "aebr %%f2,%%f4 \n\t"
        "vrepf %%v4,%%v2,2 \n\t"
        "aebr %%f2,%%f4 \n\t"
        "vrepf %%v4,%%v2,3 \n\t"
        "aebr %%f2,%%f4 \n\t"
        "ste %%f2,8(%6) \n\t"
        "vrepf %%v4,%%v3,1 \n\t"
        "aebr %%f3,%%f4 \n\t"
        "vrepf %%v4,%%v3,2 \n\t"
        "aebr %%f3,%%f4 \n\t"
        "vrepf %%v4,%%v3,3 \n\t"
        "aebr %%f3,%%f4 \n\t"
        "ste %%f3,12(%6) "
        :
        :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y)
        :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}

/*
 * Two-column kernel of the transposed SGEMV (z/Architecture vector asm).
 *
 * Computes two dot products over the first (n & ~3) elements:
 *     y[k] = sum_i ap[k][i] * x[i]      for k = 0, 1
 *
 * n   vector length; the two low bits are ignored (only (n & -32) is handled
 *     by the unrolled loop and (n & 28) by the tail loop)
 * ap  ap[0] and ap[1] point at the two matrix columns
 * x   contiguous input vector
 * y   receives the two results (overwritten, not accumulated)
 *
 * Register use: r0 = loop counter, r1 = byte offset, v0/v1 = per-column
 * partial sums (4 lanes each), v2 = scratch for the horizontal reduction.
 */
static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"xgr %%r1,%%r1 \n\t"

/* r0 = n & -32; skip the unrolled loop when fewer than 32 elements */
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"

/* unrolled loop: 32 floats (128 bytes) per iteration */
"srlg %%r0,%%r0,5 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%1) \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 1,1024(%%r1,%3) \n\t"

/* v16..v23 = 32 consecutive x values */
"vl %%v16,0(%%r1,%3) \n\t"
"vl %%v17,16(%%r1,%3) \n\t"
"vl %%v18,32(%%r1,%3) \n\t"
"vl %%v19,48(%%r1,%3) \n\t"
"vl %%v20,64(%%r1,%3) \n\t"
"vl %%v21,80(%%r1,%3) \n\t"
"vl %%v22,96(%%r1,%3) \n\t"
"vl %%v23,112(%%r1,%3) \n\t"

"vl %%v24,0(%%r1,%1) \n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"

"vl %%v26,16(%%r1,%1) \n\t"
"vfmasb %%v0,%%v17,%%v26,%%v0 \n\t"
"vl %%v27,16(%%r1,%2) \n\t"
"vfmasb %%v1,%%v17,%%v27,%%v1 \n\t"

"vl %%v28,32(%%r1,%1) \n\t"
"vfmasb %%v0,%%v18,%%v28,%%v0 \n\t"
"vl %%v29,32(%%r1,%2) \n\t"
"vfmasb %%v1,%%v18,%%v29,%%v1 \n\t"

"vl %%v30,48(%%r1,%1) \n\t"
"vfmasb %%v0,%%v19,%%v30,%%v0 \n\t"
"vl %%v31,48(%%r1,%2) \n\t"
"vfmasb %%v1,%%v19,%%v31,%%v1 \n\t"

"vl %%v24,64(%%r1,%1) \n\t"
"vfmasb %%v0,%%v20,%%v24,%%v0 \n\t"
"vl %%v25,64(%%r1,%2) \n\t"
"vfmasb %%v1,%%v20,%%v25,%%v1 \n\t"

"vl %%v26,80(%%r1,%1) \n\t"
"vfmasb %%v0,%%v21,%%v26,%%v0 \n\t"
"vl %%v27,80(%%r1,%2) \n\t"
"vfmasb %%v1,%%v21,%%v27,%%v1 \n\t"

"vl %%v28,96(%%r1,%1) \n\t"
"vfmasb %%v0,%%v22,%%v28,%%v0 \n\t"
"vl %%v29,96(%%r1,%2) \n\t"
"vfmasb %%v1,%%v22,%%v29,%%v1 \n\t"

"vl %%v30,112(%%r1,%1) \n\t"
"vfmasb %%v0,%%v23,%%v30,%%v0 \n\t"
"vl %%v31,112(%%r1,%2) \n\t"
"vfmasb %%v1,%%v23,%%v31,%%v1 \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"

/* tail: r0 = n & 28, i.e. the remaining whole groups of 4 floats */
"1: \n\t"
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"

"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%3) \n\t"

"vl %%v24,0(%%r1,%1) \n\t"
"vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"
"vl %%v25,0(%%r1,%2) \n\t"
"vfmasb %%v1,%%v16,%%v25,%%v1 \n\t"

"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"

/* horizontal reduction: add lanes 1..3 onto lane 0, then store the scalar */
"3: \n\t"
"vrepf %%v2,%%v0,1 \n\t"
"aebr %%f0,%%f2 \n\t"
"vrepf %%v2,%%v0,2 \n\t"
"aebr %%f0,%%f2 \n\t"
"vrepf %%v2,%%v0,3 \n\t"
"aebr %%f0,%%f2 \n\t"
"ste %%f0,0(%4) \n\t"
"vrepf %%v2,%%v1,1 \n\t"
"aebr %%f1,%%f2 \n\t"
"vrepf %%v2,%%v1,2 \n\t"
"aebr %%f1,%%f2 \n\t"
"vrepf %%v2,%%v1,3 \n\t"
"aebr %%f1,%%f2 \n\t"
"ste %%f1,4(%4) "
:
:"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y)
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

/*
 * Single-column kernel of the transposed SGEMV (z/Architecture vector asm).
 *
 * Computes one dot product over the first (n & ~3) elements:
 *     y[0] = sum_i a0[i] * x[i]
 *
 * n   vector length; the two low bits are ignored (only (n & -32) is handled
 *     by the unrolled loop and (n & 28) by the tail loop)
 * a0  the matrix column
 * x   contiguous input vector
 * y   receives the result (overwritten, not accumulated)
 *
 * Register use: r0 = loop counter, r1 = byte offset, v0 = partial sums
 * (4 lanes), v1 = scratch for the horizontal reduction.
 */
static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y)
{
    __asm__ volatile (
        "vzero %%v0 \n\t"
        "xgr %%r1,%%r1 \n\t"

        /* r0 = n & -32; skip the unrolled loop when fewer than 32 elements */
        "lghi %%r0,-32 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 1f \n\t"

        /* unrolled loop: 32 floats (128 bytes) per iteration */
        "srlg %%r0,%%r0,5 \n\t"
        "0: \n\t"
        "pfd 1,1024(%%r1,%1) \n\t"
        "pfd 1,1024(%%r1,%2) \n\t"

        /* v16..v23 = 32 consecutive x values */
        "vl %%v16,0(%%r1,%2) \n\t"
        "vl %%v17,16(%%r1,%2) \n\t"
        "vl %%v18,32(%%r1,%2) \n\t"
        "vl %%v19,48(%%r1,%2) \n\t"
        "vl %%v20,64(%%r1,%2) \n\t"
        "vl %%v21,80(%%r1,%2) \n\t"
        "vl %%v22,96(%%r1,%2) \n\t"
        "vl %%v23,112(%%r1,%2) \n\t"

        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"

        "vl %%v25,16(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t"

        "vl %%v26,32(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t"

        "vl %%v27,48(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t"

        "vl %%v28,64(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t"

        "vl %%v29,80(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t"

        "vl %%v30,96(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t"

        "vl %%v31,112(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t"

        /* loop back-edge: advance the byte offset past the 32 floats just
           consumed and repeat for the remaining (n & -32) / 32 - 1 blocks */
        "agfi %%r1,128 \n\t"
        "brctg %%r0,0b \n\t"

        /* tail: r0 = n & 28, i.e. the remaining whole groups of 4 floats */
        "1: \n\t"
        "lghi %%r0,28 \n\t"
        "ngr %%r0,%0 \n\t"
        "ltgr %%r0,%%r0 \n\t"
        "jz 3f \n\t"

        "srlg %%r0,%%r0,2 \n\t"
        "2: \n\t"
        "vl %%v16,0(%%r1,%2) \n\t"

        "vl %%v24,0(%%r1,%1) \n\t"
        "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t"

        "agfi %%r1,16 \n\t"
        "brctg %%r0,2b \n\t"

        /* horizontal reduction: add lanes 1..3 onto lane 0, store the scalar */
        "3: \n\t"
        "vrepf %%v1,%%v0,1 \n\t"
        "aebr %%f0,%%f1 \n\t"
        "vrepf %%v1,%%v0,2 \n\t"
        "aebr %%f0,%%f1 \n\t"
        "vrepf %%v1,%%v0,3 \n\t"
        "aebr %%f0,%%f1 \n\t"
        "ste %%f0,0(%3) "
        :
        :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y)
        :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
    );
}
/*
 * Gather a strided vector into a contiguous one:
 *     dest[k] = src[k * inc_src]   for k = 0 .. n-1
 * Does nothing when n <= 0.
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src)
{
    FLOAT *in = src;
    FLOAT *out = dest;
    BLASLONG remaining = n;

    while (remaining > 0) {
        *out = *in;
        out++;
        in += inc_src;
        remaining--;
    }
}
/*
 * Scaled vector accumulate on contiguous data (z/Architecture vector asm):
 *     dest[i] += src[i] * da      for the first (n & ~3) elements
 *
 * n     vector length; the two low bits are ignored (only (n & -32) is
 *       handled by the unrolled loop and (n & 28) by the tail loop)
 * da    scalar multiplier (read from memory and replicated into v0)
 * src   contiguous input vector
 * dest  contiguous vector updated in place
 *
 * Register use: r0 = loop counter, r1 = byte offset, v0 = replicated da.
 */
static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest)
{
__asm__ volatile (
"vlrepf %%v0,%1 \n\t"
"xgr %%r1,%%r1 \n\t"

/* r0 = n & -32; skip the unrolled loop when fewer than 32 elements */
"lghi %%r0,-32 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 1f \n\t"

/* unrolled loop: 32 floats (128 bytes) per iteration */
"srlg %%r0,%%r0,5 \n\t"
"0: \n\t"
"pfd 1,1024(%%r1,%2) \n\t"
"pfd 2,1024(%%r1,%3) \n\t"

"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"

/* each group: load 4 dest values, fused multiply-add src * da, store back */
"vl %%v24, 0(%%r1,%3) \n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
"vst %%v24, 0(%%r1,%3) \n\t"
"vl %%v25, 16(%%r1,%3) \n\t"
"vfmasb %%v25,%%v17,%%v0,%%v25 \n\t"
"vst %%v25, 16(%%r1,%3) \n\t"
"vl %%v26, 32(%%r1,%3) \n\t"
"vfmasb %%v26,%%v18,%%v0,%%v26 \n\t"
"vst %%v26, 32(%%r1,%3) \n\t"
"vl %%v27, 48(%%r1,%3) \n\t"
"vfmasb %%v27,%%v19,%%v0,%%v27 \n\t"
"vst %%v27, 48(%%r1,%3) \n\t"
"vl %%v28, 64(%%r1,%3) \n\t"
"vfmasb %%v28,%%v20,%%v0,%%v28 \n\t"
"vst %%v28, 64(%%r1,%3) \n\t"
"vl %%v29, 80(%%r1,%3) \n\t"
"vfmasb %%v29,%%v21,%%v0,%%v29 \n\t"
"vst %%v29, 80(%%r1,%3) \n\t"
"vl %%v30, 96(%%r1,%3) \n\t"
"vfmasb %%v30,%%v22,%%v0,%%v30 \n\t"
"vst %%v30, 96(%%r1,%3) \n\t"
"vl %%v31, 112(%%r1,%3) \n\t"
"vfmasb %%v31,%%v23,%%v0,%%v31 \n\t"
"vst %%v31, 112(%%r1,%3) \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"

/* tail: r0 = n & 28, i.e. the remaining whole groups of 4 floats */
"1: \n\t"
"lghi %%r0,28 \n\t"
"ngr %%r0,%0 \n\t"
"ltgr %%r0,%%r0 \n\t"
"jz 3f \n\t"

"srlg %%r0,%%r0,2 \n\t"
"2: \n\t"
"vl %%v16,0(%%r1,%2) \n\t"

"vl %%v24, 0(%%r1,%3) \n\t"
"vfmasb %%v24,%%v16,%%v0,%%v24 \n\t"
"vst %%v24, 0(%%r1,%3) \n\t"

"agfi %%r1,16 \n\t"
"brctg %%r0,2b \n\t"

"3: \n\t"
"nop "
:
:"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
/*
 * Scaled accumulate into a possibly strided vector:
 *     dest[k * inc_dest] += src[k] * da   for k = 0 .. n-1
 * A unit stride is delegated to the vector kernel add_y_kernel_4();
 * any other stride uses a plain scalar loop.
 */
static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
{
    if (inc_dest != 1) {
        FLOAT *out = dest;
        BLASLONG k;

        for (k = 0; k < n; k++, out += inc_dest)
            *out += src[k] * da;
        return;
    }

    add_y_kernel_4(n, da, src, dest);
}

/*
 * Single-precision GEMV, transposed:  y += alpha * A^T * x
 *
 * A is m x n with element (i, j) at a[i + j * lda]; x has m elements with
 * stride inc_x; y has n elements with stride inc_y.
 *
 * m, n     matrix dimensions; returns immediately when either is < 1
 * dummy1   unused
 * alpha    scalar multiplier
 * a, lda   matrix and the distance (in elements) between consecutive columns
 * x, inc_x input vector and its stride
 * y, inc_y vector updated in place and its stride
 * buffer   scratch space: the first min(m, NBMAX) floats hold a packed copy
 *          of the current x block when inc_x != 1, the area after that
 *          (ytemp) collects per-column dot products
 *
 * Rows are processed in blocks of NB <= NBMAX (always a multiple of 4) by the
 * vector kernels; the last (m & 3) rows are handled by scalar code at the end.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
BLASLONG register i;
BLASLONG register j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n0;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
/* receives the one or two results of the 4x1 / 4x2 kernels */
FLOAT ybuffer[2] __attribute__ ((aligned(16)));
FLOAT *xbuffer;
FLOAT *ytemp;

if ( m < 1 ) return(0);
if ( n < 1 ) return(0);

xbuffer = buffer;
ytemp = buffer + (m < NBMAX ? m : NBMAX);
/* columns are consumed as n0 panels of NBMAX, then n1 groups of 4, then n2 leftovers */
n0 = n / NBMAX;
n1 = (n % NBMAX) >> 2 ;
n2 = n & 3 ;

m3 = m & 3 ;
m1 = m & -4 ;
/* size of the final, partial row block */
m2 = (m & (NBMAX-1)) - m3 ;


BLASLONG NB = NBMAX;

/* one iteration per row block; ends after the first block shorter than NBMAX */
while ( NB == NBMAX )
{
m1 -= NB;
if ( m1 < 0)
{
if ( m2 == 0 ) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;

/* the kernels need contiguous x: use it in place or pack it into buffer */
if ( inc_x == 1 )
xbuffer = x_ptr;
else
copy_x(NB,x_ptr,xbuffer,inc_x);


FLOAT *ap[4];
FLOAT *yp;
BLASLONG register lda4 = 4 * lda;
ap[0] = a_ptr;
ap[1] = a_ptr + lda;
ap[2] = ap[1] + lda;
ap[3] = ap[2] + lda;

if ( n0 > 0 )
{
/* number of 4-column kernel calls per NBMAX-column panel */
BLASLONG nb1 = NBMAX / 4;
for( j=0; j<n0; j++)
{

yp = ytemp;
for( i = 0; i < nb1 ; i++)
{
sgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
/* y += alpha * (dot products of this panel) */
add_y(nb1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += nb1 * inc_y * 4;
a_ptr += nb1 * lda4 ;

}

}


yp = ytemp;

/* remaining groups of four columns */
for( i = 0; i < n1 ; i++)
{
sgemv_kernel_4x4(NB,ap,xbuffer,yp);
ap[0] += lda4 ;
ap[1] += lda4 ;
ap[2] += lda4 ;
ap[3] += lda4 ;
yp += 4;
}
if ( n1 > 0 )
{
add_y(n1*4, alpha, ytemp, y_ptr, inc_y );
y_ptr += n1 * inc_y * 4;
a_ptr += n1 * lda4 ;
}

if ( n2 & 2 )
{

/* ap[0]/ap[1] already point at the next two columns */
sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer);
a_ptr += lda * 2;
*y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y;
*y_ptr += ybuffer[1] * alpha;
y_ptr += inc_y;

}

if ( n2 & 1 )
{

sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer);
a_ptr += lda;
*y_ptr += ybuffer[0] * alpha;
y_ptr += inc_y;

}
/* advance to the next block of rows */
a += NB;
x += NB * inc_x;
}

if ( m3 == 0 ) return(0);

/* scalar tail: the last m3 (1..3) rows; x values are pre-scaled by alpha */
x_ptr = x;
a_ptr = a;
if ( m3 == 3 )
{
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp2 = *x_ptr * alpha;

FLOAT *aj = a_ptr;
y_ptr = y;

if ( lda == 3 && inc_y == 1 )
{

/* columns are packed back to back: walk A linearly */
for ( j=0; j< ( n & -4) ; j+=4 )
{

y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
aj += 12;
}

for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
aj += 3;
}

}
else
{

if ( inc_y == 1 )
{

BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;

/* four columns per iteration, then one at a time */
for ( j=0; j< ( n & -4 ); j+=4 )
{

y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2;
aj += lda4;
}

for ( ; j< n ; j++ )
{

y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ;
aj += lda;
}

}
else
{

/* general strides */
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2;
y_ptr += inc_y;
aj += lda;
}


}

}
return(0);
}

if ( m3 == 2 )
{
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;

FLOAT *aj = a_ptr;
y_ptr = y;

if ( lda == 2 && inc_y == 1 )
{

/* columns are packed back to back: walk A linearly */
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ;
y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ;
y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ;
aj += 8;

}

for ( ; j<n; j++ )
{
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ;
aj += 2;
}

}
else
{
if ( inc_y == 1 )
{

BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;

for ( j=0; j< ( n & -4 ); j+=4 )
{

y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ;
y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ;
y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ;
aj += lda4;
}

for ( ; j< n ; j++ )
{

y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ;
aj += lda;
}

}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ;
y_ptr += inc_y;
aj += lda;
}
}

}
return(0);

}

/* m3 == 1: a single leftover row */
FLOAT xtemp = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if ( lda == 1 && inc_y == 1 )
{
for ( j=0; j< ( n & -4) ; j+=4 )
{
y_ptr[j] += aj[j] * xtemp;
y_ptr[j+1] += aj[j+1] * xtemp;
y_ptr[j+2] += aj[j+2] * xtemp;
y_ptr[j+3] += aj[j+3] * xtemp;
}
for ( ; j<n ; j++ )
{
y_ptr[j] += aj[j] * xtemp;
}



}
else
{
if ( inc_y == 1 )
{

BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for ( j=0; j< ( n & -4 ); j+=4 )
{
y_ptr[j] += *aj * xtemp;
y_ptr[j+1] += *(aj+lda) * xtemp;
y_ptr[j+2] += *(aj+lda2) * xtemp;
y_ptr[j+3] += *(aj+lda3) * xtemp;
aj += lda4 ;
}

for ( ; j<n; j++ )
{
y_ptr[j] += *aj * xtemp;
aj += lda;
}

}
else
{
for ( j=0; j<n; j++ )
{
*y_ptr += *aj * xtemp;
y_ptr += inc_y;
aj += lda;
}

}
}

return(0);
}



+ 186
- 0
kernel/zarch/smax.c View File

@@ -0,0 +1,186 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Maximum of a contiguous float vector (z/Architecture vector asm).
 *
 * n  number of elements; n / 64 iterations are executed, 64 floats each.
 *    The loop has no zero-trip guard, so n must be at least 64 (the caller
 *    in this file passes n & -64 and only when that is positive).
 * x  contiguous input vector
 *
 * Returns the largest of the first (n & -64) elements as decided by the
 * "compare high" instruction (vfchsb).
 *
 * Register use: r0 = loop counter, r1 = byte offset, v0 = running per-lane
 * maximum, v16..v23 = loaded data, v24..v31 = compare masks / partial maxima.
 */
static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT max;

__asm__ volatile (
/* seed the running maximum with the first four elements */
"vl %%v0,0(%2) \n\t"
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

/* first 32 floats: pairwise compare + select reduces 8 vectors to 1 */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vfchsb %%v24,%%v16,%%v17 \n\t"
"vfchsb %%v25,%%v18,%%v19 \n\t"
"vfchsb %%v26,%%v20,%%v21 \n\t"
"vfchsb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchsb %%v28,%%v24,%%v25 \n\t"
"vfchsb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchsb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

/* fold the block maximum into the running maximum */
"vfchsb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* second 32 floats: same reduction tree */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchsb %%v24,%%v16,%%v17 \n\t"
"vfchsb %%v25,%%v18,%%v19 \n\t"
"vfchsb %%v26,%%v20,%%v21 \n\t"
"vfchsb %%v27,%%v22,%%v23 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchsb %%v28,%%v24,%%v25 \n\t"
"vfchsb %%v29,%%v26,%%v27 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchsb %%v30,%%v28,%%v29 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

"vfchsb %%v31,%%v30,%%v0 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

/* horizontal reduction, step 1: shift each doubleword left by 32 bits so
   lanes 1 and 3 line up with lanes 0 and 2, then keep the larger */
"veslg %%v16,%%v0,32 \n\t"
"vfchsb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"

/* step 2: compare lane 2 against lane 0 and return the larger in f0 */
"vrepf %%v16,%%v0,2 \n\t"
"wfchsb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
"ler %0,%%f0 "
:"=f"(max)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return max;
}
/*
 * Returns the maximum element of a strided float vector.
 *
 * n      number of elements
 * x      input vector
 * inc_x  distance (in elements) between consecutive entries
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For a unit stride the first
 * (n & -64) elements are reduced by the vector kernel smax_kernel_64() and
 * the rest by a scalar loop; other strides use a scalar loop unrolled by 4.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT best = 0.0;
    BLASLONG pos;      /* element offset into x */
    BLASLONG seen;     /* number of elements examined so far */

    if (n <= 0 || inc_x <= 0)
        return (best);

    if (inc_x == 1) {
        BLASLONG vec_len = n & -64;

        if (vec_len > 0) {
            best = smax_kernel_64(vec_len, x);
            pos = vec_len;
        } else {
            best = x[0];
            pos = 1;
        }

        /* scalar clean-up for whatever the kernel did not cover */
        for (; pos < n; pos++) {
            if (x[pos] > best)
                best = x[pos];
        }
        return (best);
    }

    /* strided access: start from the first element */
    best = x[0];
    pos = inc_x;
    seen = 1;

    {
        BLASLONG unrolled_end = (n - 1) & -4;
        BLASLONG k;

        for (; seen < unrolled_end; seen += 4, pos += inc_x * 4) {
            for (k = 0; k < 4; k++) {
                if (x[pos + k * inc_x] > best)
                    best = x[pos + k * inc_x];
            }
        }
    }

    for (; seen < n; seen++, pos += inc_x) {
        if (x[pos] > best)
            best = x[pos];
    }
    return (best);
}

+ 186
- 0
kernel/zarch/smin.c View File

@@ -0,0 +1,186 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Vector kernel: returns the minimum of the contiguous values x[0 .. n-1].
 *
 * The loop counter is n >> 6 and every pass consumes 256 bytes of x
 * (sixteen 16-byte vector loads), i.e. 64 elements when FLOAT is a 4-byte
 * float, which the short-format ("sb") instructions below assume.
 * The caller in this file only passes n = (n & -64) and only when that is > 0.
 *
 * Asm operands: %0 = result (min), %1 = n, %2 = x.
 * r0 is the pass counter, r1 the running byte offset into x.
 */
static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x)
{
FLOAT min;

__asm__ volatile (
/* seed the running per-lane minimum (v0) with the first 16 bytes of x */
"vl %%v0,0(%2) \n\t"
/* r0 = n / 64 passes, r1 = 0 byte offset */
"srlg %%r0,%1,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

/* first 128 bytes of the pass: load eight vectors */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* compare-high builds a mask (second operand > third operand); the */
/* following vsel uses it to keep the smaller value of each pair: 8 -> 4 */
"vfchsb %%v24,%%v17,%%v16 \n\t"
"vfchsb %%v25,%%v19,%%v18 \n\t"
"vfchsb %%v26,%%v21,%%v20 \n\t"
"vfchsb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

/* 4 -> 2 */
"vfchsb %%v28,%%v25,%%v24 \n\t"
"vfchsb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

/* 2 -> 1 */
"vfchsb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

/* fold the block minimum into the running minimum v0 */
"vfchsb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* second 128 bytes of the pass: same reduction tree */
"vl %%v16,128(%%r1,%2) \n\t"
"vl %%v17,144(%%r1,%2) \n\t"
"vl %%v18,160(%%r1,%2) \n\t"
"vl %%v19,176(%%r1,%2) \n\t"
"vl %%v20,192(%%r1,%2) \n\t"
"vl %%v21,208(%%r1,%2) \n\t"
"vl %%v22,224(%%r1,%2) \n\t"
"vl %%v23,240(%%r1,%2) \n\t"
"vfchsb %%v24,%%v17,%%v16 \n\t"
"vfchsb %%v25,%%v19,%%v18 \n\t"
"vfchsb %%v26,%%v21,%%v20 \n\t"
"vfchsb %%v27,%%v23,%%v22 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"
"vsel %%v26,%%v20,%%v21,%%v26 \n\t"
"vsel %%v27,%%v22,%%v23,%%v27 \n\t"

"vfchsb %%v28,%%v25,%%v24 \n\t"
"vfchsb %%v29,%%v27,%%v26 \n\t"
"vsel %%v28,%%v24,%%v25,%%v28 \n\t"
"vsel %%v29,%%v26,%%v27,%%v29 \n\t"

"vfchsb %%v30,%%v29,%%v28 \n\t"
"vsel %%v30,%%v28,%%v29,%%v30 \n\t"

"vfchsb %%v31,%%v0,%%v30 \n\t"
"vsel %%v0,%%v30,%%v0,%%v31 \n\t"

/* advance 256 bytes; brctg decrements r0 and loops while it is non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

/* horizontal reduction of v0: shifting each doubleword left by 32 bits */
/* lines up the odd 32-bit lanes with the even ones, so lanes 0 and 2 */
/* end up holding the minimum of their pair */
"veslg %%v16,%%v0,32 \n\t"
"vfchsb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"

/* replicate lane 2 and do a scalar compare/select against lane 0 */
"vrepf %%v16,%%v0,2 \n\t"
"wfchsb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
/* the result is read from f0 (NOTE(review): relies on f0 aliasing the leftmost part of v0 — confirm) */
"ler %0,%%f0 "
:"=f"(min)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

return min;
}
/*
 * Returns the smallest of the n values x[0], x[inc_x], x[2*inc_x], ...
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For contiguous data the largest
 * leading multiple of 64 elements is handed to smin_kernel_64 and the
 * remainder is scanned in C; strided data is scanned four elements per
 * step with a scalar clean-up loop.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT smallest = 0.0;
    BLASLONG pos;
    BLASLONG seen;
    BLASLONG lane;
    BLASLONG unrolled_end;

    if (n <= 0 || inc_x <= 0) {
        return smallest;
    }

    if (inc_x == 1) {
        BLASLONG vec_len = n & -64;

        if (vec_len > 0) {
            /* the vector kernel covers the first vec_len elements */
            smallest = smin_kernel_64(vec_len, x);
            pos = vec_len;
        } else {
            /* fewer than 64 elements: seed with the first one */
            smallest = x[0];
            pos = 1;
        }

        for (; pos < n; pos++) {
            if (x[pos] < smallest) {
                smallest = x[pos];
            }
        }
        return smallest;
    }

    /* strided access: element 0 seeds the result, scanning starts at 1 */
    smallest = x[0];
    pos = inc_x;
    seen = 1;

    /* seen stays at 1 mod 4, so seen < unrolled_end implies seen + 3 <= n - 1 */
    unrolled_end = (n - 1) & -4;
    for (; seen < unrolled_end; seen += 4, pos += inc_x * 4) {
        for (lane = 0; lane < 4; lane++) {
            if (x[pos + lane * inc_x] < smallest) {
                smallest = x[pos + lane * inc_x];
            }
        }
    }

    for (; seen < n; seen++, pos += inc_x) {
        if (x[pos] < smallest) {
            smallest = x[pos];
        }
    }
    return smallest;
}

+ 246
- 0
kernel/zarch/srot.c View File

@@ -0,0 +1,246 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Vector kernel for the plane rotation on contiguous data:
 *     x[i] = c*x[i] + s*y[i]
 *     y[i] = c*y[i] - s*x[i]      (using the old x[i])
 *
 * The loop counter is n >> 6 and each pass processes 256 bytes of x and of
 * y in four identical 64-byte groups, i.e. 64 elements when FLOAT is a
 * 4-byte float (assumed by the short-format "sb" instructions).
 * The caller in this file passes n = (n & -64) only when that is > 0.
 *
 * Asm operands: %0 = n, %1 = x, %2 = y, %3 = *c, %4 = *s.
 * v0 holds c and v1 holds s replicated into every lane; r0 is the pass
 * counter and r1 the running byte offset.
 */
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
"vlrepf %%v0,%3 \n\t"
"vlrepf %%v1,%4 \n\t"
"srlg %%r0,%0,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* group 1: bytes 0..63 -- v24..v27 = x, v16..v19 = y */
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
/* first parts: v28..v31 = x*c (new x), v20..v23 = x*s (new y) */
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* second parts: xn = y*s + xn, yn = y*c - yn */
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
/* write the rotated x and y back in place */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
/* group 2: bytes 64..127, same sequence */
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* second parts: xn = y*s + xn, yn = y*c - yn */
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
/* group 3: bytes 128..191, same sequence */
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* second parts: xn = y*s + xn, yn = y*c - yn */
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
/* group 4: bytes 192..255, same sequence */
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmsb %%v28,%%v24,%%v0 \n\t"
"vfmsb %%v29,%%v25,%%v0 \n\t"
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v30,%%v26,%%v0 \n\t"
"vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmsb %%v31,%%v27,%%v0 \n\t"
"vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* second parts: xn = y*s + xn, yn = y*c - yn */
"vfmasb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmasb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmasb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmasb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
/* advance 256 bytes and loop while the pass counter is non-zero */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

/*
 * Applies the plane rotation (c, s) to the vector pair (x, y) in place:
 *     x' = c*x + s*y
 *     y' = c*y - s*x
 *
 * n is the element count, inc_x / inc_y the strides.  When both strides
 * are 1 the leading multiple of 64 elements goes through srot_kernel_64
 * and the rest is rotated in C; any other stride combination uses the
 * scalar loop only.  Always returns 0; n <= 0 is a no-op.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG k = 0;
    FLOAT new_x;

    if (n <= 0) {
        return 0;
    }

    if (inc_x != 1 || inc_y != 1) {
        /* general strides: scalar rotation with independent offsets */
        BLASLONG off_x = 0;
        BLASLONG off_y = 0;

        for (; k < n; k++) {
            new_x = c*x[off_x] + s*y[off_y] ;
            y[off_y] = c*y[off_y] - s*x[off_x] ;
            x[off_x] = new_x ;

            off_x += inc_x;
            off_y += inc_y;
        }
        return 0;
    }

    /* contiguous data: vector kernel first, scalar loop for the tail */
    BLASLONG vec_len = n & -64;
    if (vec_len > 0) {
        /* the kernel reads c and s from memory, so pass addressable copies */
        FLOAT cosa = c;
        FLOAT sina = s;

        srot_kernel_64(vec_len, x, y, &cosa, &sina);
        k = vec_len;
    }

    for (; k < n; k++) {
        new_x = c*x[k] + s*y[k] ;
        y[k] = c*y[k] - s*x[k] ;
        x[k] = new_x ;
    }

    return 0;
}



+ 201
- 0
kernel/zarch/sscal.c View File

@@ -0,0 +1,201 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Vector kernel: scales contiguous x in place, x[i] = da * x[i].
 *
 * The loop counter is n >> 5 and each pass rewrites 128 bytes of x (eight
 * 16-byte vectors), i.e. 32 elements when FLOAT is a 4-byte float (assumed
 * by the short-format "sb" multiply).  The caller in this file passes
 * n = (n & -32) only when that is > 0.
 *
 * Asm operands: %0 = n, %1 = da (read from memory), %2 = x.
 * v0 holds da replicated into every lane; r0 is the pass counter and r1
 * the running byte offset.
 */
static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x)
{
__asm__ volatile (
"vlrepf %%v0,%1 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* eight load / multiply-by-da / store triples, one per 16-byte vector */
"vl %%v24, 0(%%r1,%2) \n\t"
"vfmsb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 0(%%r1,%2) \n\t"
"vl %%v25, 16(%%r1,%2) \n\t"
"vfmsb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 16(%%r1,%2) \n\t"
"vl %%v26, 32(%%r1,%2) \n\t"
"vfmsb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 32(%%r1,%2) \n\t"
"vl %%v27, 48(%%r1,%2) \n\t"
"vfmsb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%2) \n\t"
"vfmsb %%v24,%%v24,%%v0 \n\t"
"vst %%v24, 64(%%r1,%2) \n\t"
"vl %%v25, 80(%%r1,%2) \n\t"
"vfmsb %%v25,%%v25,%%v0 \n\t"
"vst %%v25, 80(%%r1,%2) \n\t"
"vl %%v26, 96(%%r1,%2) \n\t"
"vfmsb %%v26,%%v26,%%v0 \n\t"
"vst %%v26, 96(%%r1,%2) \n\t"
"vl %%v27, 112(%%r1,%2) \n\t"
"vfmsb %%v27,%%v27,%%v0 \n\t"
"vst %%v27, 112(%%r1,%2) \n\t"
/* advance 128 bytes and loop while the pass counter is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v24","v25","v26","v27"
);
}

/*
 * Vector kernel: overwrites contiguous x with zeros (the da == 0 case of
 * the scaling routine in this file).
 *
 * The loop counter is n >> 5 and each pass stores 128 bytes (eight 16-byte
 * vectors), i.e. 32 elements when FLOAT is a 4-byte float.  The caller in
 * this file passes n = (n & -32) only when that is > 0.
 *
 * Asm operands: %0 = n, %1 = x.  v24..v27 are zeroed once before the loop;
 * r0 is the pass counter and r1 the running byte offset.
 */
static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x)
{
__asm__ volatile(
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %%r0,%0,5 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"

/* x is never loaded: the old contents are simply overwritten */
"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
}

/*
 * Scales the n elements of x (stride inc_x) by da in place.
 *
 * da == 0.0 is special-cased: the elements are overwritten with 0.0
 * instead of being multiplied.  Contiguous data uses the 32-element vector
 * kernels for the leading multiple of 32 elements; strided data is handled
 * two elements per step.  The dummy*, y and inc_y parameters are unused.
 * Returns 0; n <= 0 or inc_x <= 0 is a no-op.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG off = 0;   /* element offset into x for the strided paths */
    BLASLONG done = 0;  /* number of elements already processed */

    if (n <= 0 || inc_x <= 0) {
        return 0;
    }

    if (inc_x == 1) {
        BLASLONG vec_len = n & -32;

        if (da == 0.0) {
            if (vec_len > 0) {
                sscal_kernel_32_zero(vec_len, x);
                done = vec_len;
            }
            for (; done < n; done++) {
                x[done]=0.0;
            }
        } else {
            if (vec_len > 0) {
                sscal_kernel_32(vec_len, da, x);
                done = vec_len;
            }
            for (; done < n; done++) {
                x[done] = da * x[done] ;
            }
        }
        return 0;
    }

    /* strided access: pairs first, then at most one leftover element */
    BLASLONG pair_end = n & -2;

    if (da == 0.0) {
        for (; done < pair_end; done += 2, off += inc_x * 2) {
            x[off]=0.0;
            x[off + inc_x]=0.0;
        }
        for (; done < n; done++, off += inc_x) {
            x[off]=0.0;
        }
    } else {
        for (; done < pair_end; done += 2, off += inc_x * 2) {
            x[off] = da * x[off] ;
            x[off + inc_x] = da * x[off + inc_x];
        }
        for (; done < n; done++, off += inc_x) {
            x[off] = da * x[off] ;
        }
    }

    return 0;
}



+ 164
- 0
kernel/zarch/sswap.c View File

@@ -0,0 +1,164 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

/*
 * Vector kernel: exchanges the contents of contiguous x and y.
 *
 * The loop counter is n >> 6 and each pass swaps 256 bytes of each array
 * (sixteen 16-byte vectors), i.e. 64 elements when FLOAT is a 4-byte float.
 * The caller in this file passes n = (n & -64) only when that is > 0.
 *
 * Asm operands: %0 = n, %1 = x, %2 = y.  r0 is the pass counter and r1 the
 * running byte offset.  Per pass: the whole x block is parked in v16..v31,
 * y is copied over x through v0..v7 in two 128-byte halves, then the
 * parked x block is stored to y.
 */
static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"srlg %%r0,%0,6 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
/* park 256 bytes of x in v16..v31 */
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"

/* y[0..127 bytes] -> x */
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"

/* y[128..255 bytes] -> x */
"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"

/* parked x block -> y */
"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
/* advance 256 bytes and loop while the pass counter is non-zero */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

/*
 * Exchanges the n elements of x (stride inc_x) with those of y (stride
 * inc_y).
 *
 * When both strides are 1 the leading multiple of 64 elements is swapped
 * by sswap_kernel_64 and the remainder element by element; any other
 * stride combination uses the scalar loop only.  The dummy* parameters
 * are unused.  Always returns 0; n <= 0 is a no-op.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k = 0;
    FLOAT held;

    if (n <= 0) {
        return 0;
    }

    if (inc_x != 1 || inc_y != 1) {
        /* general strides: walk both vectors with independent offsets */
        BLASLONG off_x = 0;
        BLASLONG off_y = 0;

        for (; k < n; k++) {
            held = y[off_y];
            y[off_y] = x[off_x];
            x[off_x] = held;

            off_x += inc_x;
            off_y += inc_y;
        }
        return 0;
    }

    /* contiguous data: vector kernel first, scalar loop for the tail */
    BLASLONG vec_len = n & -64;
    if (vec_len > 0) {
        sswap_kernel_64(vec_len, x, y);
        k = vec_len;
    }

    for (; k < n; k++) {
        held = y[k];
        y[k] = x[k];
        x[k] = held;
    }

    return 0;
}



+ 221
- 0
kernel/zarch/zamax.c View File

@@ -0,0 +1,221 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

/* ABS is fabs when DOUBLE is defined and fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * CABS1(x,i) = |x[i]| + |x[i + 1]|.  The code below passes the index of a
 * real part, so this is |Re| + |Im| of one complex element stored as two
 * adjacent values.
 */
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))

/*
 * Vector kernel: returns the maximum of |Re| + |Im| over n contiguous
 * complex elements stored as interleaved (re, im) pairs in x.
 *
 * The loop counter is n >> 4 and each pass consumes 256 bytes, i.e. 16
 * complex elements of two 8-byte values each; the long-format ("db")
 * instructions assume FLOAT is an 8-byte double.  The caller in this file
 * passes n = (n & -16) only when that is > 0.
 *
 * Asm operands: %0 = result (amax), %1 = n, %2 = x.
 * r0 is the pass counter, r1 the running byte offset, v0 the running
 * two-lane maximum.
 */
static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amax;

__asm__ volatile (
/* seed v0 with |re|+|im| of elements 0 and 1: vleg loads one 8-byte */
/* value into the given lane, so v0 gathers the real parts and v16 the */
/* imaginary parts */
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

/* first 8 elements: even registers collect real parts, odd registers */
/* collect imaginary parts, two elements per register pair */
"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
/* absolute values */
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
/* v16..v19 = |re| + |im| for the eight elements */
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
/* compare-high mask (second operand > third) + select keeps the larger */
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"

"vfchdb %%v26,%%v24,%%v25 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"

/* fold into the running maximum */
"vfchdb %%v27,%%v26,%%v0 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"

/* second 8 elements: same sequence at byte offsets 128..255 */
"vleg %%v16,128(%%r1,%2),0 \n\t"
"vleg %%v17,136(%%r1,%2),0 \n\t"
"vleg %%v16,144(%%r1,%2),1 \n\t"
"vleg %%v17,152(%%r1,%2),1 \n\t"
"vleg %%v18,160(%%r1,%2),0 \n\t"
"vleg %%v19,168(%%r1,%2),0 \n\t"
"vleg %%v18,176(%%r1,%2),1 \n\t"
"vleg %%v19,184(%%r1,%2),1 \n\t"
"vleg %%v20,192(%%r1,%2),0 \n\t"
"vleg %%v21,200(%%r1,%2),0 \n\t"
"vleg %%v20,208(%%r1,%2),1 \n\t"
"vleg %%v21,216(%%r1,%2),1 \n\t"
"vleg %%v22,224(%%r1,%2),0 \n\t"
"vleg %%v23,232(%%r1,%2),0 \n\t"
"vleg %%v22,240(%%r1,%2),1 \n\t"
"vleg %%v23,248(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v24,%%v16,%%v17 \n\t"
"vfchdb %%v25,%%v18,%%v19 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"

"vfchdb %%v26,%%v24,%%v25 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"

"vfchdb %%v27,%%v26,%%v0 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"

/* advance 256 bytes and loop while the pass counter is non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

/* horizontal step: compare lane 1 (replicated) against lane 0 and keep */
/* the larger; the result is then read from f0 */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v16,%%v0 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(amax)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);

return amax;
}
/*
 * Returns the maximum of |Re(x_k)| + |Im(x_k)| over the n complex elements
 * of x, stored as interleaved (re, im) pairs with a stride of inc_x complex
 * elements.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For contiguous data the leading
 * multiple of 16 elements is handled by zamax_kernel_16 and the remainder
 * in C; strided data is scanned four elements per step with a scalar
 * clean-up loop.
 *
 * Fix: the contiguous tail loop used to compare only |Re(x_k)|, ignoring
 * the imaginary part; it now uses CABS1 like every other path.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG j = 0;
    BLASLONG k;
    FLOAT maxf = 0.0;
    FLOAT mag;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return (maxf);

    if (inc_x == 1) {

        BLASLONG n1 = n & -16;
        if (n1 > 0) {
            /* the vector kernel covers complex elements 0 .. n1-1 */
            maxf = zamax_kernel_16(n1, x);
            i = n1;
        } else {
            /* fewer than 16 elements: seed with element 0 */
            maxf = CABS1(x,0);
            i++;
        }

        /* i counts complex elements; element i occupies x[2*i], x[2*i+1] */
        while (i < n) {
            mag = CABS1(x, i * 2);
            if (mag > maxf) {
                maxf = mag;
            }
            i++;
        }
        return (maxf);

    } else {

        /* here i is an index into the FLOAT array, j counts complex elements */
        inc_x2 = 2 * inc_x;
        maxf = CABS1(x,0);
        i += inc_x2;
        j++;

        /* j stays at 1 mod 4, so j < n1 implies j + 3 <= n - 1 */
        BLASLONG n1 = (n - 1) & -4;
        while (j < n1) {

            for (k = 0; k < 4; k++) {
                mag = CABS1(x, i + inc_x2 * k);
                if (mag > maxf) {
                    maxf = mag;
                }
            }

            i += inc_x2 * 4;
            j += 4;
        }

        while (j < n) {
            mag = CABS1(x,i);
            if (mag > maxf) {
                maxf = mag;
            }
            i += inc_x2;
            j++;
        }
        return (maxf);
    }
}

+ 221
- 0
kernel/zarch/zamin.c View File

@@ -0,0 +1,221 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

/* ABS is fabs when DOUBLE is defined and fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * CABS1(x,i) = |x[i]| + |x[i + 1]|.  The code below passes the index of a
 * real part, so this is |Re| + |Im| of one complex element stored as two
 * adjacent values.
 */
#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))

/*
 * Vector kernel: returns the minimum of |Re| + |Im| over n contiguous
 * complex elements stored as interleaved (re, im) pairs in x.
 *
 * The loop counter is n >> 4 and each pass consumes 256 bytes, i.e. 16
 * complex elements of two 8-byte values each; the long-format ("db")
 * instructions assume FLOAT is an 8-byte double.  The caller in this file
 * passes n = (n & -16) only when that is > 0.
 *
 * Asm operands: %0 = result (amin), %1 = n, %2 = x.
 * r0 is the pass counter, r1 the running byte offset, v0 the running
 * two-lane minimum.
 */
static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT amin;

__asm__ volatile (
/* seed v0 with |re|+|im| of elements 0 and 1: vleg loads one 8-byte */
/* value into the given lane, so v0 gathers the real parts and v16 the */
/* imaginary parts */
"vleg %%v0,0(%2),0 \n\t"
"vleg %%v16,8(%2),0 \n\t"
"vleg %%v0,16(%2),1 \n\t"
"vleg %%v16,24(%2),1 \n\t"
"vflpdb %%v0,%%v0 \n\t"
"vflpdb %%v16,%%v16 \n\t"
"vfadb %%v0,%%v0,%%v16 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

/* first 8 elements: even registers collect real parts, odd registers */
/* collect imaginary parts, two elements per register pair */
"vleg %%v16,0(%%r1,%2),0 \n\t"
"vleg %%v17,8(%%r1,%2),0 \n\t"
"vleg %%v16,16(%%r1,%2),1 \n\t"
"vleg %%v17,24(%%r1,%2),1 \n\t"
"vleg %%v18,32(%%r1,%2),0 \n\t"
"vleg %%v19,40(%%r1,%2),0 \n\t"
"vleg %%v18,48(%%r1,%2),1 \n\t"
"vleg %%v19,56(%%r1,%2),1 \n\t"
"vleg %%v20,64(%%r1,%2),0 \n\t"
"vleg %%v21,72(%%r1,%2),0 \n\t"
"vleg %%v20,80(%%r1,%2),1 \n\t"
"vleg %%v21,88(%%r1,%2),1 \n\t"
"vleg %%v22,96(%%r1,%2),0 \n\t"
"vleg %%v23,104(%%r1,%2),0 \n\t"
"vleg %%v22,112(%%r1,%2),1 \n\t"
"vleg %%v23,120(%%r1,%2),1 \n\t"
/* absolute values */
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
/* v16..v19 = |re| + |im| for the eight elements */
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
/* compare-high mask (second operand > third) with the operands swapped */
/* relative to the select, so each select keeps the smaller value */
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"

"vfchdb %%v26,%%v25,%%v24 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"

/* fold into the running minimum */
"vfchdb %%v27,%%v0,%%v26 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"

/* second 8 elements: same sequence at byte offsets 128..255 */
"vleg %%v16,128(%%r1,%2),0 \n\t"
"vleg %%v17,136(%%r1,%2),0 \n\t"
"vleg %%v16,144(%%r1,%2),1 \n\t"
"vleg %%v17,152(%%r1,%2),1 \n\t"
"vleg %%v18,160(%%r1,%2),0 \n\t"
"vleg %%v19,168(%%r1,%2),0 \n\t"
"vleg %%v18,176(%%r1,%2),1 \n\t"
"vleg %%v19,184(%%r1,%2),1 \n\t"
"vleg %%v20,192(%%r1,%2),0 \n\t"
"vleg %%v21,200(%%r1,%2),0 \n\t"
"vleg %%v20,208(%%r1,%2),1 \n\t"
"vleg %%v21,216(%%r1,%2),1 \n\t"
"vleg %%v22,224(%%r1,%2),0 \n\t"
"vleg %%v23,232(%%r1,%2),0 \n\t"
"vleg %%v22,240(%%r1,%2),1 \n\t"
"vleg %%v23,248(%%r1,%2),1 \n\t"
"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"
"vfadb %%v16,%%v16,%%v17 \n\t"
"vfadb %%v17,%%v18,%%v19 \n\t"
"vfadb %%v18,%%v20,%%v21 \n\t"
"vfadb %%v19,%%v22,%%v23 \n\t"
"vfchdb %%v24,%%v17,%%v16 \n\t"
"vfchdb %%v25,%%v19,%%v18 \n\t"
"vsel %%v24,%%v16,%%v17,%%v24 \n\t"
"vsel %%v25,%%v18,%%v19,%%v25 \n\t"

"vfchdb %%v26,%%v25,%%v24 \n\t"
"vsel %%v26,%%v24,%%v25,%%v26 \n\t"

"vfchdb %%v27,%%v0,%%v26 \n\t"
"vsel %%v0,%%v26,%%v0,%%v27 \n\t"

/* advance 256 bytes and loop while the pass counter is non-zero */
"agfi %%r1, 256 \n\t"
"brctg %%r0, 0b \n\t"

/* horizontal step: compare lane 0 against lane 1 (replicated) and keep */
/* the smaller; the result is then read from f0 */
"vrepg %%v16,%%v0,1 \n\t"
"wfchdb %%v17,%%v0,%%v16 \n\t"
"vsel %%v0,%%v16,%%v0,%%v17 \n\t"
"ldr %0,%%f0 "
:"=f"(amin)
:"r"(n),"ZR"((const FLOAT (*)[n])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
);

return amin;
}
/*
 * Returns the minimum of |Re(x_k)| + |Im(x_k)| over the n complex elements
 * of x, stored as interleaved (re, im) pairs with a stride of inc_x complex
 * elements.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For contiguous data the leading
 * multiple of 16 elements is handled by zamin_kernel_16 and the remainder
 * in C; strided data is scanned four elements per step with a scalar
 * clean-up loop.
 *
 * Fix: the contiguous tail loop used to compare only |Re(x_k)|, which could
 * yield a result smaller than any element's true |Re| + |Im|; it now uses
 * CABS1 like every other path.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG i = 0;
    BLASLONG j = 0;
    BLASLONG k;
    FLOAT minf = 0.0;
    FLOAT mag;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return (minf);

    if (inc_x == 1) {

        BLASLONG n1 = n & -16;
        if (n1 > 0) {
            /* the vector kernel covers complex elements 0 .. n1-1 */
            minf = zamin_kernel_16(n1, x);
            i = n1;
        } else {
            /* fewer than 16 elements: seed with element 0 */
            minf = CABS1(x,0);
            i++;
        }

        /* i counts complex elements; element i occupies x[2*i], x[2*i+1] */
        while (i < n) {
            mag = CABS1(x, i * 2);
            if (mag < minf) {
                minf = mag;
            }
            i++;
        }
        return (minf);

    } else {

        /* here i is an index into the FLOAT array, j counts complex elements */
        inc_x2 = 2 * inc_x;
        minf = CABS1(x,0);
        i += inc_x2;
        j++;

        /* j stays at 1 mod 4, so j < n1 implies j + 3 <= n - 1 */
        BLASLONG n1 = (n - 1) & -4;
        while (j < n1) {

            for (k = 0; k < 4; k++) {
                mag = CABS1(x, i + inc_x2 * k);
                if (mag < minf) {
                    minf = mag;
                }
            }

            i += inc_x2 * 4;
            j += 4;
        }

        while (j < n) {
            mag = CABS1(x,i);
            if (mag < minf) {
                minf = mag;
            }
            i += inc_x2;
            j++;
        }
        return (minf);
    }
}

+ 79
- 73
kernel/zarch/zasum.c View File

@@ -25,92 +25,98 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/


#include "common.h"
#include <math.h>

#if defined(DOUBLE)

#define ABS fabs

#else

#define ABS fabsf

#endif


static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) {
static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x)
{
FLOAT asum;
__asm__ (
"pfd 1, 0(%[ptr_x]) \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[ptr_x] \n\t"
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v22 \n\t"
"vzero %%v23 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%[ptr_tmp] ) \n\t"
"vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t"
"vfadb %%v23,%%v23,%%v26 \n\t"
"vfadb %%v22,%%v22,%%v27 \n\t"
"vfadb %%v0,%%v0,%%v28 \n\t"
"vfadb %%v1,%%v1,%%v29 \n\t"
"vfadb %%v23,%%v23,%%v30 \n\t"
"vfadb %%v22,%%v22,%%v31 \n\t"
"vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t"
"vflpdb %%v24, %%v24 \n\t"
"vflpdb %%v25, %%v25 \n\t"
"vflpdb %%v26, %%v26 \n\t"
"vflpdb %%v27, %%v27 \n\t"
"vflpdb %%v28, %%v28 \n\t"
"vflpdb %%v29, %%v29 \n\t"
"vflpdb %%v30, %%v30 \n\t"
"vflpdb %%v31, %%v31 \n\t"
"la %[ptr_tmp],256(%[ptr_tmp]) \n\t"
"vfadb %%v0,%%v0,%%v24 \n\t"
"vfadb %%v1,%%v1,%%v25 \n\t"
"vfadb %%v23,%%v23,%%v26 \n\t"
"vfadb %%v22,%%v22,%%v27 \n\t"
"vfadb %%v0,%%v0,%%v28 \n\t"
"vfadb %%v1,%%v1,%%v29 \n\t"
"vfadb %%v23,%%v23,%%v30 \n\t"
"vfadb %%v22,%%v22,%%v31 \n\t"
"clgrjl %[ptr_tmp],%%r0,1b \n\t"
"vfadb %%v24,%%v0,%%v1 \n\t"
"vfadb %%v25,%%v23,%%v22 \n\t"
"vfadb %%v0,%%v25,%%v24 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %[asum] ,%%f0"
: [asum] "=f"(asum),[ptr_tmp] "+&a"(x)
: [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x)
: "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return asum;
__asm__ (
"vzero %%v0 \n\t"
"vzero %%v1 \n\t"
"vzero %%v2 \n\t"
"vzero %%v3 \n\t"
"srlg %%r0,%1,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vl %%v20, 64(%%r1,%2) \n\t"
"vl %%v21, 80(%%r1,%2) \n\t"
"vl %%v22, 96(%%r1,%2) \n\t"
"vl %%v23, 112(%%r1,%2) \n\t"

"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"

"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"

"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vl %%v20, 192(%%r1,%2) \n\t"
"vl %%v21, 208(%%r1,%2) \n\t"
"vl %%v22, 224(%%r1,%2) \n\t"
"vl %%v23, 240(%%r1,%2) \n\t"

"vflpdb %%v16, %%v16 \n\t"
"vflpdb %%v17, %%v17 \n\t"
"vflpdb %%v18, %%v18 \n\t"
"vflpdb %%v19, %%v19 \n\t"
"vflpdb %%v20, %%v20 \n\t"
"vflpdb %%v21, %%v21 \n\t"
"vflpdb %%v22, %%v22 \n\t"
"vflpdb %%v23, %%v23 \n\t"

"vfadb %%v0,%%v0,%%v16 \n\t"
"vfadb %%v1,%%v1,%%v17 \n\t"
"vfadb %%v2,%%v2,%%v18 \n\t"
"vfadb %%v3,%%v3,%%v19 \n\t"
"vfadb %%v0,%%v0,%%v20 \n\t"
"vfadb %%v1,%%v1,%%v21 \n\t"
"vfadb %%v2,%%v2,%%v22 \n\t"
"vfadb %%v3,%%v3,%%v23 \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b \n\t"
"vfadb %%v0,%%v0,%%v1 \n\t"
"vfadb %%v0,%%v0,%%v2 \n\t"
"vfadb %%v0,%%v0,%%v3 \n\t"
"vrepg %%v1,%%v0,1 \n\t"
"adbr %%f0,%%f1 \n\t"
"ldr %0,%%f0 "
:"=f"(asum)
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
);

return asum;
}


FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
@@ -128,7 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 )
{

sumf=zasum_kernel_16(n1, x );
sumf = zasum_kernel_16(n1, x);
i=n1;
ip=2*n1;
}


+ 87
- 129
kernel/zarch/zaxpy.c View File

@@ -23,142 +23,98 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

*****************************************************************************/

#include "common.h"

static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) {

BLASLONG tempR1 ;
__asm__ ("pfd 1, 0(%[x_tmp]) \n\t"
"pfd 2, 0(%[y_tmp]) \n\t"
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
__asm__ volatile(
#if !defined(CONJ)
"lgdr %[t1],%[alpha_r] \n\t"
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
"lgdr %[t1],%[alpha_i] \n\t"
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
"vflcdb %%v29,%%v29 \n\t" //complement both
"vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i}

#else
"lgdr %[t1],%[alpha_i] \n\t"
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
"lgdr %[t1],%[alpha_r] \n\t"
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
"vflcdb %%v28,%%v28 \n\t" //complement both
"vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r}
#endif
"xgr %[t1],%[t1] \n\t"
"sllg %[tmp],%[tmp],4 \n\t"
"vl %%v30 , 0(%[t1],%[y_tmp]) \n\t"
"vl %%v31 , 16(%[t1],%[y_tmp]) \n\t"
"vl %%v6 , 32(%[t1],%[y_tmp]) \n\t"
"vl %%v7 , 48(%[t1],%[y_tmp]) \n\t"
"vl %%v20 , 0(%[t1],%[x_tmp]) \n\t"
"vl %%v21 , 16(%[t1],%[x_tmp]) \n\t"
"vl %%v22 , 32(%[t1],%[x_tmp]) \n\t"
"vl %%v23 , 48(%[t1],%[x_tmp]) \n\t"
"lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition
"j 2f \n\t"
".align 16 \n\t"
"1: \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmadb %%v16, %%v20, %%v28, %%v16 \n\t"
"vfmadb %%v17, %%v21, %%v28, %%v17 \n\t"
"vfmadb %%v18, %%v22, %%v28, %%v18 \n\t"
"vfmadb %%v19, %%v23, %%v28, %%v19 \n\t"
"vl %%v30, 64(%[t1],%[y_tmp]) \n\t"
"vl %%v31, 80(%[t1],%[y_tmp]) \n\t"
"vl %%v6 , 96(%[t1],%[y_tmp]) \n\t"
"vl %%v7 , 112(%[t1],%[y_tmp]) \n\t"
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"
"vl %%v20 , 64(%[t1],%[x_tmp]) \n\t"
"vl %%v21 , 80(%[t1],%[x_tmp]) \n\t"
"vl %%v22 , 96(%[t1],%[x_tmp]) \n\t"
"vl %%v23 ,112(%[t1],%[x_tmp]) \n\t"

"vst %%v16 , 0(%[t1],%[y_tmp]) \n\t"
"vst %%v17 , 16(%[t1],%[y_tmp]) \n\t"
"vst %%v18 , 32(%[t1],%[y_tmp]) \n\t"
"vst %%v19 , 48(%[t1],%[y_tmp]) \n\t"
"la %[t1],64(%[t1] ) \n\t"
"2: \n\t"
"pfd 1, 256(%[t1],%[x_tmp]) \n\t"
"pfd 2, 256(%[t1],%[y_tmp]) \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"

"vfmadb %%v30, %%v20, %%v28, %%v30 \n\t"
"vfmadb %%v31, %%v21, %%v28, %%v31 \n\t"
"vfmadb %%v6, %%v22, %%v28, %%v6 \n\t"
"vfmadb %%v7, %%v23, %%v28, %%v7 \n\t"
"vl %%v16, 64(%[t1],%[y_tmp]) \n\t"
"vl %%v17, 80(%[t1],%[y_tmp]) \n\t"
"vl %%v18, 96(%[t1],%[y_tmp]) \n\t"
"vl %%v19, 112(%[t1],%[y_tmp]) \n\t"
"vfmadb %%v30, %%v24, %%v29, %%v30 \n\t"
"vfmadb %%v31, %%v25, %%v29, %%v31 \n\t"
"vfmadb %%v6, %%v26, %%v29, %%v6 \n\t"
"vfmadb %%v7, %%v27, %%v29, %%v7 \n\t"

"vl %%v20 , 64(%[t1],%[x_tmp]) \n\t"
"vl %%v21 , 80(%[t1],%[x_tmp]) \n\t"
"vl %%v22 , 96(%[t1],%[x_tmp]) \n\t"
"vl %%v23 ,112(%[t1],%[x_tmp]) \n\t"

"vst %%v30 , 0(%[t1],%[y_tmp]) \n\t"
"vst %%v31 , 16(%[t1],%[y_tmp]) \n\t"
"vst %%v6 , 32(%[t1],%[y_tmp]) \n\t"
"vst %%v7 , 48(%[t1],%[y_tmp]) \n\t"
"la %[t1],64(%[t1] ) \n\t"

"clgrjl %[t1],%[tmp],1b \n\t"
//----------------------------------------------------------------------
"vfmadb %%v16, %%v20, %%v28, %%v16 \n\t"
"vfmadb %%v17, %%v21, %%v28, %%v17 \n\t"
"vfmadb %%v18, %%v22, %%v28, %%v18 \n\t"
"vfmadb %%v19, %%v23, %%v28, %%v19 \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"

"vst %%v16 , 0(%[t1],%[y_tmp]) \n\t"
"vst %%v17 , 16(%[t1],%[y_tmp]) \n\t"
"vst %%v18 , 32(%[t1],%[y_tmp]) \n\t"
"vst %%v19 , 48(%[t1],%[y_tmp]) \n\t"

: [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) , [t1] "=&a" (tempR1)
: [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i)
: "cc", "v6","v7", "v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

"vlrepg %%v0,0(%3) \n\t"
"vleg %%v1,8(%3),0 \n\t"
"wflcdb %%v1,%%v1 \n\t"
"vleg %%v1,8(%3),1 \n\t"
#else
"vleg %%v0,0(%3),1 \n\t"
"vflcdb %%v0,%%v0 \n\t"
"vleg %%v0,0(%3),0 \n\t"
"vlrepg %%v1,8(%3) \n\t"
#endif
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"

"vl %%v16,0(%%r1,%1) \n\t"
"vl %%v17,16(%%r1,%1) \n\t"
"vl %%v18,32(%%r1,%1) \n\t"
"vl %%v19,48(%%r1,%1) \n\t"
"vl %%v20,0(%%r1,%2) \n\t"
"vl %%v21,16(%%r1,%2) \n\t"
"vl %%v22,32(%%r1,%2) \n\t"
"vl %%v23,48(%%r1,%2) \n\t"
"vpdi %%v24,%%v16,%%v16,4 \n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t"

"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"

"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"

"vst %%v28,0(%%r1,%2) \n\t"
"vst %%v29,16(%%r1,%2) \n\t"
"vst %%v30,32(%%r1,%2) \n\t"
"vst %%v31,48(%%r1,%2) \n\t"

"vl %%v16,64(%%r1,%1) \n\t"
"vl %%v17,80(%%r1,%1) \n\t"
"vl %%v18,96(%%r1,%1) \n\t"
"vl %%v19,112(%%r1,%1) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
"vpdi %%v24,%%v16,%%v16,4 \n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t"

"vfmadb %%v28,%%v16,%%v0,%%v20 \n\t"
"vfmadb %%v29,%%v17,%%v0,%%v21 \n\t"
"vfmadb %%v30,%%v18,%%v0,%%v22 \n\t"
"vfmadb %%v31,%%v19,%%v0,%%v23 \n\t"

"vfmadb %%v28,%%v24,%%v1,%%v28 \n\t"
"vfmadb %%v29,%%v25,%%v1,%%v29 \n\t"
"vfmadb %%v30,%%v26,%%v1,%%v30 \n\t"
"vfmadb %%v31,%%v27,%%v1,%%v31 \n\t"

"vst %%v28,64(%%r1,%2) \n\t"
"vst %%v29,80(%%r1,%2) \n\t"
"vst %%v30,96(%%r1,%2) \n\t"
"vst %%v31,112(%%r1,%2) \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}


int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0;
BLASLONG ix = 0, iy = 0;
FLOAT da[2];

if (n <= 0) return (0);

@@ -166,8 +122,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

BLASLONG n1 = n & -8;

if (n1) {
zaxpy_kernel_8(n1, x, y, da_r,da_i);
if (n1) {
da[0] = da_r;
da[1] = da_i;
zaxpy_kernel_8(n1, x, y, da);
ix = 2 * n1;
}
i = n1;


+ 20
- 66
kernel/zarch/zcopy.c View File

@@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013-2017, The OpenBLAS Project
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -24,71 +24,28 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {

__asm__ volatile(
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"srlg %[n_tmp],%[n_tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"

"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"

"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"


"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"

"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"

"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"

"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"

"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
: [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n)
: [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31"
);
return;
#include "common.h"

/*
 * Copies n complex elements from x to y in chunks of 256 bytes using the
 * storage-to-storage MVC instruction.
 *
 *   n  element count; the loop runs n >> 4 times, so any n % 16 remainder is
 *      not copied here
 *   x  source, read through r1
 *   y  destination, written through r2
 *
 * NOTE(review): the loop body runs before the counter is tested, so n >> 4
 * must be at least 1 -- presumably the caller only passes a positive multiple
 * of 16; the call site is not visible in this excerpt.
 * NOTE(review): 256 bytes per pass equals 16 elements only if one element is
 * two 8-byte FLOATs (double precision) -- confirm FLOAT for this build.
 */
static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile (
/* r1 = running source address, r2 = running destination address */
"lgr %%r1,%1 \n\t"
"lgr %%r2,%2 \n\t"
/* r0 = n >> 4 : number of 256-byte passes */
"srlg %%r0,%0,4 \n\t"
"0: \n\t"
/* prefetch hints 1024 bytes ahead of each pointer (code 1 on the source, code 2 on the destination) */
"pfd 1, 1024(%%r1) \n\t"
"pfd 2, 1024(%%r2) \n\t"
/* move 256 bytes from the address in r1 to the address in r2 */
"mvc 0(256,%%r2),0(%%r1) \n\t"
/* advance both pointers past the chunk just copied */
"agfi %%r1,256 \n\t"
"agfi %%r2,256 \n\t"
/* decrement r0 and branch back to label 0 while it is non-zero */
"brctg %%r0,0b "
:
/* inputs: %0 = n, %1 = x, %2 = y (both pointers in address registers) */
:"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
/* "memory" tells the compiler the asm reads and writes memory it cannot see through the operands */
:"memory","cc","r0","r1","r2"
);
}


int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
@@ -137,9 +94,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
}

}
return(0);
return(0);
}



+ 83
- 130
kernel/zarch/zdot.c View File

@@ -23,137 +23,92 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

*****************************************************************************/

#include "common.h"
#if defined(Z13)

static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {

static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
__asm__ volatile(
"pfd 1, 0(%[ptr_x_tmp]) \n\t"
"pfd 1, 0(%[ptr_y_tmp]) \n\t"
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"srlg %[n_tmp],%[n_tmp],3 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t"
"pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t"


"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"



"vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t"
"vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t"
"vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t"
"vfmadb %%v24,%%v16,%%v28,%%v24 \n\t"
"vfmadb %%v25,%%v20,%%v28,%%v25 \n\t"
"vfmadb %%v26,%%v17,%%v29,%%v26 \n\t"
"vfmadb %%v27,%%v21,%%v29,%%v27 \n\t"
"vfmadb %%v24,%%v18,%%v30,%%v24 \n\t"
"vfmadb %%v25,%%v22,%%v30,%%v25 \n\t"
"vfmadb %%v26,%%v19,%%v31,%%v26 \n\t"
"vfmadb %%v27,%%v23,%%v31,%%v27 \n\t"


"la %%r1,128(%%r1) \n\t"
"brctg %[n_tmp],1b \n\t"
"vfadb %%v24,%%v26,%%v24 \n\t"
"vfadb %%v25,%%v25,%%v27 \n\t"
"vsteg %%v24, 0(%[ptr_d]),0 \n\t"
"vsteg %%v24, 8(%[ptr_d]),1 \n\t"
"vsteg %%v25,16(%[ptr_d]),1 \n\t"
"vsteg %%v25,24(%[ptr_d]),0 \n\t"
: [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n)
: [mem_x] "m"( *(const double (*)[2*n])x),
[mem_y] "m"( *(const double (*)[2*n])y),
[ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d)
: "cc", "r1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);

}

#else

static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) {
BLASLONG register i = 0;
FLOAT dot[4] = {0.0, 0.0, 0.0, 0.0};
BLASLONG j = 0;

while (i < n) {

dot[0] += x[j] * y[j];
dot[1] += x[j + 1] * y[j + 1];
dot[2] += x[j] * y[j + 1];
dot[3] += x[j + 1] * y[j];

dot[0] += x[j + 2] * y[j + 2];
dot[1] += x[j + 3] * y[j + 3];
dot[2] += x[j + 2] * y[j + 3];
dot[3] += x[j + 3] * y[j + 2];

dot[0] += x[j + 4] * y[j + 4];
dot[1] += x[j + 5] * y[j + 5];
dot[2] += x[j + 4] * y[j + 5];
dot[3] += x[j + 5] * y[j + 4];

dot[0] += x[j + 6] * y[j + 6];
dot[1] += x[j + 7] * y[j + 7];
dot[2] += x[j + 6] * y[j + 7];
dot[3] += x[j + 7] * y[j + 6];

j += 8;
i += 4;

}
d[0] = dot[0];
d[1] = dot[1];
d[2] = dot[2];
d[3] = dot[3];

"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"vzero %%v28 \n\t"
"vzero %%v29 \n\t"
"vzero %%v30 \n\t"
"vzero %%v31 \n\t"
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 1, 1024(%%r1,%1) \n\t"
"pfd 1, 1024(%%r1,%2) \n\t"

"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t"

"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmadb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31 \n\t"

"vl %%v16, 64(%%r1,%1) \n\t"
"vl %%v17, 80(%%r1,%1) \n\t"
"vl %%v18, 96(%%r1,%1) \n\t"
"vl %%v19, 112(%%r1,%1) \n\t"
"vl %%v0, 64(%%r1,%2) \n\t"
"vl %%v1, 80(%%r1,%2) \n\t"
"vl %%v2, 96(%%r1,%2) \n\t"
"vl %%v3, 112(%%r1,%2) \n\t"
"vpdi %%v20,%%v16,%%v16,4 \n\t"
"vpdi %%v21,%%v17,%%v17,4 \n\t"
"vpdi %%v22,%%v18,%%v18,4 \n\t"
"vpdi %%v23,%%v19,%%v19,4 \n\t"

"vfmadb %%v24,%%v16,%%v0,%%v24 \n\t"
"vfmadb %%v25,%%v20,%%v0,%%v25 \n\t"
"vfmadb %%v26,%%v17,%%v1,%%v26 \n\t"
"vfmadb %%v27,%%v21,%%v1,%%v27 \n\t"
"vfmadb %%v28,%%v18,%%v2,%%v28 \n\t"
"vfmadb %%v29,%%v22,%%v2,%%v29 \n\t"
"vfmadb %%v30,%%v19,%%v3,%%v30 \n\t"
"vfmadb %%v31,%%v23,%%v3,%%v31 \n\t"

"agfi %%r1,128 \n\t"
"brctg %%r0,0b \n\t"
"vfadb %%v24,%%v24,%%v26 \n\t"
"vfadb %%v24,%%v24,%%v28 \n\t"
"vfadb %%v24,%%v24,%%v30 \n\t"
"vfadb %%v25,%%v25,%%v27 \n\t"
"vfadb %%v25,%%v25,%%v29 \n\t"
"vfadb %%v25,%%v25,%%v31 \n\t"
"vsteg %%v24,0(%3),0 \n\t"
"vsteg %%v24,8(%3),1 \n\t"
"vsteg %%v25,16(%3),1 \n\t"
"vsteg %%v25,24(%3),0 "
:
:"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d)
:"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

#endif

OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
BLASLONG i = 0;
BLASLONG ix=0, iy=0;
BLASLONG i;
BLASLONG ix, iy;
OPENBLAS_COMPLEX_FLOAT result;
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};

@@ -167,14 +122,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
if ((inc_x == 1) && (inc_y == 1)) {

BLASLONG n1 = n & -8;
BLASLONG j=0;

if (n1){
if (n1)
zdot_kernel_8(n1, x, y, dot);
i = n1;
j = n1 <<1;
}

i = n1;
BLASLONG j = i * 2;

while (i < n) {



+ 167
- 172
kernel/zarch/zrot.c View File

@@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA)
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
{
__asm__ (
"pfd 2, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
"lgdr %%r1,%[cos] \n\t"
"vlvgp %%v0,%%r1,%%r1 \n\t"
"lgdr %%r1,%[sin] \n\t"
"vlvgp %%v1,%%r1,%%r1 \n\t"
"sllg %[tmp],%[tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */

"vst %%v28, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v27,112(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v19,112(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */

"vst %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */

"vst %%v28, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v17, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v18, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v19, 240(%%r1,%[ptr_y]) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */

"vst %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 240(%%r1,%[ptr_y]) \n\t"

"la %%r1,256(%%r1) \n\t"
"clgrjl %%r1,%[tmp],1b \n\t"
: [mem_x] "+m" (*(double (*)[2*n])x),
[mem_y] "+m" (*(double (*)[2*n])y),
[tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA)
: "cc","r1" ,"v0","v1","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;

__asm__ (
"vlrepg %%v0,%3 \n\t"
"vlrepg %%v1,%4 \n\t"
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v24, 0(%%r1,%1) \n\t"
"vl %%v25, 16(%%r1,%1) \n\t"
"vl %%v26, 32(%%r1,%1) \n\t"
"vl %%v27, 48(%%r1,%1) \n\t"
"vl %%v16, 0(%%r1,%2) \n\t"
"vl %%v17, 16(%%r1,%2) \n\t"
"vl %%v18, 32(%%r1,%2) \n\t"
"vl %%v19, 48(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 0(%%r1,%1) \n\t"
"vst %%v29, 16(%%r1,%1) \n\t"
"vst %%v30, 32(%%r1,%1) \n\t"
"vst %%v31, 48(%%r1,%1) \n\t"
"vst %%v20, 0(%%r1,%2) \n\t"
"vst %%v21, 16(%%r1,%2) \n\t"
"vst %%v22, 32(%%r1,%2) \n\t"
"vst %%v23, 48(%%r1,%2) \n\t"
"vl %%v24, 64(%%r1,%1) \n\t"
"vl %%v25, 80(%%r1,%1) \n\t"
"vl %%v26, 96(%%r1,%1) \n\t"
"vl %%v27, 112(%%r1,%1) \n\t"
"vl %%v16, 64(%%r1,%2) \n\t"
"vl %%v17, 80(%%r1,%2) \n\t"
"vl %%v18, 96(%%r1,%2) \n\t"
"vl %%v19, 112(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 64(%%r1,%1) \n\t"
"vst %%v29, 80(%%r1,%1) \n\t"
"vst %%v30, 96(%%r1,%1) \n\t"
"vst %%v31, 112(%%r1,%1) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v16, 128(%%r1,%2) \n\t"
"vl %%v17, 144(%%r1,%2) \n\t"
"vl %%v18, 160(%%r1,%2) \n\t"
"vl %%v19, 176(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 128(%%r1,%1) \n\t"
"vst %%v29, 144(%%r1,%1) \n\t"
"vst %%v30, 160(%%r1,%1) \n\t"
"vst %%v31, 176(%%r1,%1) \n\t"
"vst %%v20, 128(%%r1,%2) \n\t"
"vst %%v21, 144(%%r1,%2) \n\t"
"vst %%v22, 160(%%r1,%2) \n\t"
"vst %%v23, 176(%%r1,%2) \n\t"
"vl %%v24, 192(%%r1,%1) \n\t"
"vl %%v25, 208(%%r1,%1) \n\t"
"vl %%v26, 224(%%r1,%1) \n\t"
"vl %%v27, 240(%%r1,%1) \n\t"
"vl %%v16, 192(%%r1,%2) \n\t"
"vl %%v17, 208(%%r1,%2) \n\t"
"vl %%v18, 224(%%r1,%2) \n\t"
"vl %%v19, 240(%%r1,%2) \n\t"
"vfmdb %%v28,%%v24,%%v0 \n\t"
"vfmdb %%v29,%%v25,%%v0 \n\t"
"vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v30,%%v26,%%v0 \n\t"
"vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */
"vfmdb %%v31,%%v27,%%v0 \n\t"
"vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */
/* 2nd parts*/
"vfmadb %%v28,%%v16,%%v1,%%v28 \n\t"
"vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */
"vfmadb %%v29,%%v17,%%v1,%%v29 \n\t"
"vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */
"vfmadb %%v30,%%v18,%%v1,%%v30 \n\t"
"vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */
"vfmadb %%v31,%%v19,%%v1,%%v31 \n\t"
"vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */
"vst %%v28, 192(%%r1,%1) \n\t"
"vst %%v29, 208(%%r1,%1) \n\t"
"vst %%v30, 224(%%r1,%1) \n\t"
"vst %%v31, 240(%%r1,%1) \n\t"
"vst %%v20, 192(%%r1,%2) \n\t"
"vst %%v21, 208(%%r1,%2) \n\t"
"vst %%v22, 224(%%r1,%2) \n\t"
"vst %%v23, 240(%%r1,%2) \n\t"
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
@@ -214,8 +204,11 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT

BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
zrot_kernel_16(n1, x, y, c, s);
{
FLOAT cosa,sina;
cosa=c;
sina=s;
zrot_kernel_16(n1, x, y, &cosa, &sina);
i=n1;
ix=2*n1;
}
@@ -234,6 +227,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT

}


}
else
{
@@ -259,3 +253,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
}



+ 200
- 260
kernel/zarch/zscal.c View File

@@ -23,270 +23,211 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
*****************************************************************************/

#include "common.h"


static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) {
BLASLONG tempR1 ;
__asm__ (
"pfd 2, 0(%[x_tmp]) \n\t"
#if !defined(CONJ)
"lgdr %[t1],%[alpha_r] \n\t"
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
"lgdr %[t1],%[alpha_i] \n\t"
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
"vflcdb %%v29,%%v29 \n\t" //complement both
"vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i}

#else
"lgdr %[t1],%[alpha_i] \n\t"
"vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint
"lgdr %[t1],%[alpha_r] \n\t"
"vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint
"vflcdb %%v28,%%v28 \n\t" //complement both
"vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r}
#endif
"xgr %[t1],%[t1] \n\t"
"sllg %[tmp],%[tmp],4 \n\t"
"vl %%v20 , 0(%[t1],%[x_tmp]) \n\t"
"vl %%v21 , 16(%[t1],%[x_tmp]) \n\t"
"vl %%v22 , 32(%[t1],%[x_tmp]) \n\t"
"vl %%v23 , 48(%[t1],%[x_tmp]) \n\t"
"lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition
"j 2f \n\t"
".align 16 \n\t"
"1: \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmdb %%v16, %%v20, %%v28 \n\t"
"vfmdb %%v17, %%v21, %%v28 \n\t"
"vfmdb %%v18, %%v22, %%v28 \n\t"
"vfmdb %%v19, %%v23, %%v28 \n\t"
"vl %%v20, 64(%[t1],%[x_tmp]) \n\t"
"vl %%v21, 80(%[t1],%[x_tmp]) \n\t"
"vl %%v22, 96(%[t1],%[x_tmp]) \n\t"
"vl %%v23, 112(%[t1],%[x_tmp]) \n\t"
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"


"vst %%v16 , 0(%[t1],%[x_tmp]) \n\t"
"vst %%v17 , 16(%[t1],%[x_tmp]) \n\t"
"vst %%v18 , 32(%[t1],%[x_tmp]) \n\t"
"vst %%v19 , 48(%[t1],%[x_tmp]) \n\t"
"la %[t1],64(%[t1] ) \n\t"
"2: \n\t"
"pfd 2, 256(%[t1],%[x_tmp]) \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"

"vfmdb %%v30, %%v20, %%v28 \n\t"
"vfmdb %%v31, %%v21, %%v28 \n\t"
"vfmdb %%v6, %%v22, %%v28 \n\t"
"vfmdb %%v7, %%v23, %%v28 \n\t"

"vl %%v20 , 64(%[t1],%[x_tmp]) \n\t"
"vl %%v21 , 80(%[t1],%[x_tmp]) \n\t"
"vl %%v22 , 96(%[t1],%[x_tmp]) \n\t"
"vl %%v23 ,112(%[t1],%[x_tmp]) \n\t"

"vfmadb %%v30, %%v24, %%v29, %%v30 \n\t"
"vfmadb %%v31, %%v25, %%v29, %%v31 \n\t"
"vfmadb %%v6, %%v26, %%v29, %%v6 \n\t"
"vfmadb %%v7, %%v27, %%v29, %%v7 \n\t"


"vst %%v30 , 0(%[t1],%[x_tmp]) \n\t"
"vst %%v31 , 16(%[t1],%[x_tmp]) \n\t"
"vst %%v6 , 32(%[t1],%[x_tmp]) \n\t"
"vst %%v7 , 48(%[t1],%[x_tmp]) \n\t"
"la %[t1],64(%[t1] ) \n\t"

"clgrjl %[t1],%[tmp],1b \n\t"
//----------------------------------------------------------------------
"vfmdb %%v16, %%v20, %%v28 \n\t"
"vfmdb %%v17, %%v21, %%v28 \n\t"
"vfmdb %%v18, %%v22, %%v28 \n\t"
"vfmdb %%v19, %%v23, %%v28 \n\t"
"vpdi %%v24 , %%v20, %%v20, 4 \n\t"
"vpdi %%v25 , %%v21, %%v21, 4 \n\t"
"vpdi %%v26 , %%v22, %%v22, 4 \n\t"
"vpdi %%v27 , %%v23, %%v23, 4 \n\t"
"vfmadb %%v16, %%v24, %%v29, %%v16 \n\t"
"vfmadb %%v17, %%v25, %%v29, %%v17 \n\t"
"vfmadb %%v18, %%v26, %%v29, %%v18 \n\t"
"vfmadb %%v19, %%v27, %%v29, %%v19 \n\t"

"vst %%v16 , 0(%[t1],%[x_tmp]) \n\t"
"vst %%v17 , 16(%[t1],%[x_tmp]) \n\t"
"vst %%v18 , 32(%[t1],%[x_tmp]) \n\t"
"vst %%v19 , 48(%[t1],%[x_tmp]) \n\t"

: [mem_x] "+m" (*(double (*)[2*n])x),[tmp]"+&r"(n) , [t1] "=&a" (tempR1)
: [x_tmp] "a"(x), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i)
: "cc", "v6","v7", "v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);


/*
 * zscal_kernel_8: scale contiguous complex values in x, in place, by the
 * complex scalar alpha[0] + i*alpha[1]; eight (re, im) pairs (128 bytes) are
 * processed per loop iteration.
 *
 * n     - number of complex elements. The loop count is n >> 3 and the loop is
 *         closed by brctg (decrement, then branch while non-zero), so a count
 *         of zero is not handled; the caller in this file passes
 *         n1 = n & -8 only when n1 > 0.
 * alpha - two consecutive values: real part at byte offset 0, imaginary part
 *         at byte offset 8.
 * x     - interleaved (re, im) pairs, overwritten with the scaled values.
 *
 * The double-precision instruction forms (vfmdb/vfmadb) are used, so this
 * presumably requires FLOAT to be double - confirm against the build setup.
 */
static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
/* v0 = {alpha[0], alpha[0]} */
"vlrepg %%v0,0(%1) \n\t"
/* v1 = {-alpha[1], alpha[1]}: load element 0, negate it, then load element 1 */
"vleg %%v1,8(%1),0 \n\t"
"wflcdb %%v1,%%v1 \n\t"
"vleg %%v1,8(%1),1 \n\t"
/* r0 = n / 8 (iteration count), r1 = 0 (byte offset into x) */
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch for store, 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%2) \n\t"

/* v16..v23 = eight (re, im) pairs */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* v24..v31 = the same pairs with their halves exchanged: (im, re) */
"vpdi %%v24,%%v16,%%v16,4 \n\t"
"vpdi %%v25,%%v17,%%v17,4 \n\t"
"vpdi %%v26,%%v18,%%v18,4 \n\t"
"vpdi %%v27,%%v19,%%v19,4 \n\t"
"vpdi %%v28,%%v20,%%v20,4 \n\t"
"vpdi %%v29,%%v21,%%v21,4 \n\t"
"vpdi %%v30,%%v22,%%v22,4 \n\t"
"vpdi %%v31,%%v23,%%v23,4 \n\t"

/* (re, im) * alpha[0] */
"vfmdb %%v16,%%v16,%%v0 \n\t"
"vfmdb %%v17,%%v17,%%v0 \n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t"
"vfmdb %%v19,%%v19,%%v0 \n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v0 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v0 \n\t"
/* += (im, re) * (-alpha[1], alpha[1]), giving the complex product */
"vfmadb %%v16,%%v24,%%v1,%%v16 \n\t"
"vfmadb %%v17,%%v25,%%v1,%%v17 \n\t"
"vfmadb %%v18,%%v26,%%v1,%%v18 \n\t"
"vfmadb %%v19,%%v27,%%v1,%%v19 \n\t"
"vfmadb %%v20,%%v28,%%v1,%%v20 \n\t"
"vfmadb %%v21,%%v29,%%v1,%%v21 \n\t"
"vfmadb %%v22,%%v30,%%v1,%%v22 \n\t"
"vfmadb %%v23,%%v31,%%v1,%%v23 \n\t"

/* write the results back over the inputs */
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"

/* advance by 128 bytes; decrement r0 and repeat while it is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
/* %0 = n, %1 = alpha, %2 = x; there are no output operands, the write to x is covered by the "memory" clobber */
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) {
__asm__ ( "pfd 2, 0(%1) \n\t"
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v16,%%r0,%%r0 \n\t" //load both from disjoint
"vflcdb %%v16,%%v16 \n\t" //complement both
"vlvgg %%v16,%%r0,0 \n\t" //restore 1st
"vlr %%v17 ,%%v16 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
"vl %%v24, 0(%[x_ptr]) \n\t"
"vfmdb %%v24,%%v24,%%v16 \n\t"
"vsteg %%v24, 0(%[x_ptr]),1 \n\t"
"vsteg %%v24, 8(%[x_ptr]),0 \n\t"
"vl %%v25, 16(%[x_ptr]) \n\t"
"vfmdb %%v25,%%v25,%%v17 \n\t"
"vsteg %%v25, 16(%[x_ptr]),1 \n\t"
"vsteg %%v25, 24(%[x_ptr]),0 \n\t"
"vl %%v26, 32(%[x_ptr]) \n\t"
"vfmdb %%v26,%%v26,%%v16 \n\t"
"vsteg %%v26, 32(%[x_ptr]),1 \n\t"
"vsteg %%v26, 40(%[x_ptr]),0 \n\t"
"vl %%v27, 48(%[x_ptr]) \n\t"
"vfmdb %%v27,%%v27,%%v17 \n\t"
"vsteg %%v27, 48(%[x_ptr]),1 \n\t"
"vsteg %%v27, 56(%[x_ptr]),0 \n\t"
"vl %%v28, 64(%[x_ptr]) \n\t"
"vfmdb %%v28,%%v28,%%v16 \n\t"
"vsteg %%v28, 64(%[x_ptr]),1 \n\t"
"vsteg %%v28, 72(%[x_ptr]),0 \n\t"
"vl %%v29, 80(%[x_ptr]) \n\t"
"vfmdb %%v29,%%v29,%%v17 \n\t"
"vsteg %%v29, 80(%[x_ptr]),1 \n\t"
"vsteg %%v29, 88(%[x_ptr]),0 \n\t"
"vl %%v30, 96(%[x_ptr]) \n\t"
"vfmdb %%v30,%%v30,%%v16 \n\t"
"vsteg %%v30, 96(%[x_ptr]),1 \n\t"
"vsteg %%v30, 104(%[x_ptr]),0 \n\t"
"vl %%v31, 112(%[x_ptr]) \n\t"
"vfmdb %%v31,%%v31,%%v17 \n\t"
"vsteg %%v31, 112(%[x_ptr]),1 \n\t"
"vsteg %%v31, 120(%[x_ptr]),0 \n\t"
"la %[x_ptr],128(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n),[alpha] "f"(da_i)
:"cc", "r0","f0", "f1","v16","v17" ,"v24","v25","v26","v27","v28","v29","v30","v31"
);
/*
 * zscal_kernel_8_zero_r: in-place complex scaling of x for the case where only
 * the imaginary part of the scalar is used. Each pair (re, im) becomes
 * (-alpha[1] * im, alpha[1] * re); alpha[0] is never read here.
 * Eight pairs (128 bytes) are processed per loop iteration.
 *
 * n     - number of complex elements. The loop count is n >> 3 and the loop is
 *         closed by brctg (decrement, then branch while non-zero), so a count
 *         of zero is not handled; the caller in this file passes
 *         n1 = n & -8 only when n1 > 0, and selects this kernel when
 *         da_r == 0.0 and da_i is non-zero.
 * alpha - two consecutive values; only the one at byte offset 8 is loaded.
 * x     - interleaved (re, im) pairs, overwritten with the scaled values.
 *
 * The double-precision instruction forms are used, so this presumably
 * requires FLOAT to be double - confirm against the build setup.
 */
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
/* v0 = {-alpha[1], alpha[1]}: load element 0, negate it, then load element 1 */
"vleg %%v0,8(%1),0 \n\t"
"wflcdb %%v0,%%v0 \n\t"
"vleg %%v0,8(%1),1 \n\t"
/* r0 = n / 8 (iteration count), r1 = 0 (byte offset into x) */
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch for store, 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%2) \n\t"

/* v16..v23 = eight (re, im) pairs */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* exchange the halves of every pair in place: (re, im) -> (im, re) */
"vpdi %%v16,%%v16,%%v16,4 \n\t"
"vpdi %%v17,%%v17,%%v17,4 \n\t"
"vpdi %%v18,%%v18,%%v18,4 \n\t"
"vpdi %%v19,%%v19,%%v19,4 \n\t"
"vpdi %%v20,%%v20,%%v20,4 \n\t"
"vpdi %%v21,%%v21,%%v21,4 \n\t"
"vpdi %%v22,%%v22,%%v22,4 \n\t"
"vpdi %%v23,%%v23,%%v23,4 \n\t"

/* (im, re) * (-alpha[1], alpha[1]) */
"vfmdb %%v16,%%v16,%%v0 \n\t"
"vfmdb %%v17,%%v17,%%v0 \n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t"
"vfmdb %%v19,%%v19,%%v0 \n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v0 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v0 \n\t"

/* write the results back over the inputs */
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"

/* advance by 128 bytes; decrement r0 and repeat while it is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
/* %0 = n, %1 = alpha, %2 = x; there are no output operands, the write to x is covered by the "memory" clobber */
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
}

static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) {
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v18,%%r0,%%r0 \n\t"
"vlr %%v19,%%v18 \n\t"
"vlr %%v16,%%v18 \n\t"
"vlr %%v17,%%v18 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
"vl %%v24, 0(%[x_ptr]) \n\t"
"vfmdb %%v24,%%v24,%%v18 \n\t"
"vst %%v24, 0(%[x_ptr]) \n\t"
"vl %%v25, 16(%[x_ptr]) \n\t"
"vfmdb %%v25,%%v25,%%v19 \n\t"
"vst %%v25, 16(%[x_ptr]) \n\t"
"vl %%v26, 32(%[x_ptr]) \n\t"
"vfmdb %%v26,%%v26,%%v16 \n\t"
"vst %%v26, 32(%[x_ptr]) \n\t"
"vl %%v27, 48(%[x_ptr]) \n\t"
"vfmdb %%v27,%%v27,%%v17 \n\t"
"vst %%v27, 48(%[x_ptr]) \n\t"
"vl %%v28, 64(%[x_ptr]) \n\t"
"vfmdb %%v28,%%v28,%%v18 \n\t"
"vst %%v28, 64(%[x_ptr]) \n\t"
"vl %%v29, 80(%[x_ptr]) \n\t"
"vfmdb %%v29,%%v29,%%v19 \n\t"
"vst %%v29, 80(%[x_ptr]) \n\t"
"vl %%v30, 96(%[x_ptr]) \n\t"
"vfmdb %%v30,%%v30,%%v16 \n\t"
"vst %%v30, 96(%[x_ptr]) \n\t"
"vl %%v31,112(%[x_ptr]) \n\t"
"vfmdb %%v31,%%v31,%%v17 \n\t"
"vst %%v31,112(%[x_ptr]) \n\t"
"la %[x_ptr],128(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x)
: [n] "r"(n),[alpha] "f"(da_r)
: "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31"
);
/*
 * zscal_kernel_8_zero_i: in-place scaling of x for the case where only the
 * real part of the scalar is used. Every value in x (real and imaginary
 * parts alike) is multiplied by alpha[0]; alpha[1] is never read here.
 * Eight (re, im) pairs (128 bytes) are processed per loop iteration.
 *
 * n     - number of complex elements. The loop count is n >> 3 and the loop is
 *         closed by brctg (decrement, then branch while non-zero), so a count
 *         of zero is not handled; the caller in this file passes
 *         n1 = n & -8 only when n1 > 0, and selects this kernel when
 *         da_i == 0 and da_r is non-zero.
 * alpha - two consecutive values; only the one at byte offset 0 is loaded.
 * x     - interleaved (re, im) pairs, overwritten with the scaled values.
 *
 * The double-precision instruction forms are used, so this presumably
 * requires FLOAT to be double - confirm against the build setup.
 */
static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x)
{
__asm__ volatile(
/* v0 = {alpha[0], alpha[0]} */
"vlrepg %%v0,0(%1) \n\t"
/* r0 = n / 8 (iteration count), r1 = 0 (byte offset into x) */
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch for store, 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%2) \n\t"
/* v16..v23 = eight (re, im) pairs */
"vl %%v16,0(%%r1,%2) \n\t"
"vl %%v17,16(%%r1,%2) \n\t"
"vl %%v18,32(%%r1,%2) \n\t"
"vl %%v19,48(%%r1,%2) \n\t"
"vl %%v20,64(%%r1,%2) \n\t"
"vl %%v21,80(%%r1,%2) \n\t"
"vl %%v22,96(%%r1,%2) \n\t"
"vl %%v23,112(%%r1,%2) \n\t"
/* multiply both halves of every pair by alpha[0] */
"vfmdb %%v16,%%v16,%%v0 \n\t"
"vfmdb %%v17,%%v17,%%v0 \n\t"
"vfmdb %%v18,%%v18,%%v0 \n\t"
"vfmdb %%v19,%%v19,%%v0 \n\t"
"vfmdb %%v20,%%v20,%%v0 \n\t"
"vfmdb %%v21,%%v21,%%v0 \n\t"
"vfmdb %%v22,%%v22,%%v0 \n\t"
"vfmdb %%v23,%%v23,%%v0 \n\t"
/* write the results back over the inputs */
"vst %%v16,0(%%r1,%2) \n\t"
"vst %%v17,16(%%r1,%2) \n\t"
"vst %%v18,32(%%r1,%2) \n\t"
"vst %%v19,48(%%r1,%2) \n\t"
"vst %%v20,64(%%r1,%2) \n\t"
"vst %%v21,80(%%r1,%2) \n\t"
"vst %%v22,96(%%r1,%2) \n\t"
"vst %%v23,112(%%r1,%2) \n\t"
/* advance by 128 bytes; decrement r0 and repeat while it is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
/* %0 = n, %1 = alpha, %2 = x; there are no output operands, the write to x is covered by the "memory" clobber */
:"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23"
);
}

static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) {

__asm__ ( "pfd 2, 0(%[x_ptr]) \n\t"
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
"sllg %%r0,%[n],4 \n\t"
"agr %%r0,%[x_ptr] \n\t"
".align 16 \n\t"
"1: \n\t"
"pfd 2, 256( %[x_ptr]) \n\t"
"vst %%v24, 0( %[x_ptr]) \n\t"
"vst %%v25, 16( %[x_ptr]) \n\t"
"vst %%v26, 32( %[x_ptr]) \n\t"
"vst %%v27, 48( %[x_ptr]) \n\t"
"vst %%v24, 64( %[x_ptr]) \n\t"
"vst %%v25, 80( %[x_ptr]) \n\t"
"vst %%v26, 96( %[x_ptr]) \n\t"
"vst %%v27,112( %[x_ptr]) \n\t"
"la %[x_ptr],128(%[x_ptr]) \n\t"
"clgrjl %[x_ptr],%%r0,1b \n\t"
: [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x)
: [n] "r"(n)
:"cc" ,"r0","v24","v25","v26","v27"
);

/*
 * zscal_kernel_8_zero: overwrite x with zero bytes, 128 bytes (eight 16-byte
 * vectors) per loop iteration. Nothing is loaded from x, so the previous
 * contents do not influence the result.
 *
 * n - number of complex elements. The loop count is n >> 3 and the loop is
 *     closed by brctg (decrement, then branch while non-zero), so a count of
 *     zero is not handled; the caller in this file passes n1 = n & -8 only
 *     when n1 > 0, and selects this kernel when both da_r and da_i are zero.
 * x - destination buffer.
 */
static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x)
{
__asm__ volatile(
/* four all-zero vector registers, reused for the eight stores below */
"vzero %%v24 \n\t"
"vzero %%v25 \n\t"
"vzero %%v26 \n\t"
"vzero %%v27 \n\t"
/* r0 = n / 8 (iteration count), r1 = 0 (byte offset into x) */
"srlg %%r0,%0,3 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch for store, 1024 bytes ahead of the current offset */
"pfd 2, 1024(%%r1,%1) \n\t"

"vst %%v24,0(%%r1,%1) \n\t"
"vst %%v25,16(%%r1,%1) \n\t"
"vst %%v26,32(%%r1,%1) \n\t"
"vst %%v27,48(%%r1,%1) \n\t"
"vst %%v24,64(%%r1,%1) \n\t"
"vst %%v25,80(%%r1,%1) \n\t"
"vst %%v26,96(%%r1,%1) \n\t"
"vst %%v27,112(%%r1,%1) \n\t"
/* advance by 128 bytes; decrement r0 and repeat while it is non-zero */
"agfi %%r1,128 \n\t"
"brctg %%r0,0b "
:
/* %0 = n, %1 = x; there are no output operands, the write to x is covered by the "memory" clobber */
:"r"(n),"ZR"((FLOAT (*)[n * 2])x)
:"memory","cc","r0","r1","v24","v25","v26","v27"
);
}





static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) {

static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i;
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_x3 = inc_x2 + inc_x;
FLOAT t0, t1, t2, t3;
FLOAT t0, t1, t2, t3;
FLOAT da_r = alpha[0];
FLOAT da_i = alpha[1];

for (i = 0; i < n; i += 4) {
for (i = 0; i < n; i += 4)
{
t0 = da_r * x[0] - da_i * x[1];
t1 = da_r * x[inc_x] - da_i * x[inc_x + 1];
t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1];
@@ -303,17 +244,14 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLAS
x[inc_x3] = t3;

x += 4 * inc_x;

}


}

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
BLASLONG i = 0, j = 0;
FLOAT temp0;
FLOAT temp1;
FLOAT alpha[2] __attribute__ ((aligned(16)));

if (inc_x != 1) {
inc_x <<= 1;
@@ -405,8 +343,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
} else {

BLASLONG n1 = n & -8;
if (n1 > 0) {
zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x);
if (n1 > 0) {
alpha[0] = da_r;
alpha[1] = da_i;
zscal_kernel_inc_8(n1, alpha, x, inc_x);
j = n1;
i = n1 * inc_x;
}
@@ -432,17 +372,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
BLASLONG n1 = n & -8;
if (n1 > 0) {

alpha[0] = da_r;
alpha[1] = da_i;

if (da_r == 0.0)
if (da_i == 0)
zscal_kernel_8_zero(n1, x);
else
zscal_kernel_8_zero_r(n1, da_i, x);
zscal_kernel_8_zero_r(n1, alpha, x);
else
if (da_i == 0)
zscal_kernel_8_zero_i(n1, da_r, x);
zscal_kernel_8_zero_i(n1, alpha, x);
else
zscal_kernel_8(n1, da_r,da_i, x);
zscal_kernel_8(n1, alpha, x);

i = n1 << 1;
j = n1;
@@ -508,5 +450,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,

return (0);
}



+ 82
- 209
kernel/zarch/zswap.c View File

@@ -25,220 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"


#if defined(Z13_SWAP_A)
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
/*
 * zswap_kernel_16: exchange the contents of the contiguous buffers x and y,
 * 256 bytes from each buffer per loop iteration (16 elements of 16 bytes,
 * matching the name; the first asm statement types the buffers as
 * double[2*n]).
 *
 * n - element count. Both asm statements derive the iteration count as
 *     n >> 4 and close the loop with brctg (decrement, then branch while
 *     non-zero), so a count of zero is not handled.
 * x, y - the two buffers; each is read and then overwritten with the other's
 *        data.
 *
 * NOTE(review): as the text stands, the body holds two asm statements with a
 * `return;` between them, so the second statement is never reached. This
 * looks like the removed and the added side of a diff rendered back to back -
 * confirm against the actual source file before relying on either variant.
 */
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
/* First variant: swaps one 16-byte vector at a time (load x, load y, store crosswise). */
__asm__ volatile(
"pfd 1, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
/* n_tmp = n / 16 (iteration count, updated in place), r1 = 0 (byte offset) */
"srlg %[n_tmp],%[n_tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
/* prefetch for store, 256 bytes ahead in both buffers */
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
"vl %%v24, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v16, 0(%%r1,%[ptr_x]) \n\t"

"vl %%v25, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 16(%%r1,%[ptr_x]) \n\t"

"vl %%v26, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 32(%%r1,%[ptr_x]) \n\t"

"vl %%v27, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 48(%%r1,%[ptr_x]) \n\t"

"vl %%v28, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_x]) \n\t"

"vl %%v29, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_x]) \n\t"

"vl %%v30, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_x]) \n\t"

"vl %%v31, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_x]) \n\t"

"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v16, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v16, 128(%%r1,%[ptr_x]) \n\t"

"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 144(%%r1,%[ptr_x]) \n\t"

"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 160(%%r1,%[ptr_x]) \n\t"

"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 176(%%r1,%[ptr_x]) \n\t"

"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 192(%%r1,%[ptr_x]) \n\t"

"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 208(%%r1,%[ptr_x]) \n\t"

"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 224(%%r1,%[ptr_x]) \n\t"

"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 240(%%r1,%[ptr_x]) \n\t"
/* advance by 256 bytes; decrement n_tmp and repeat while it is non-zero */
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
/* both buffers are declared as read-write memory operands; n is consumed as the loop counter */
: [mem_x] "+m" (*(double (*)[2*n])x),
[mem_y] "+m" (*(double (*)[2*n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23"
,"v24","v25","v26","v27","v28","v29","v30","v31"
);
return;

/*
 * Second variant (not reached because of the return above): loads 256 bytes
 * of x into v16..v31, copies y into x through v0..v7 in two halves, then
 * stores the saved x data into y.
 */
__asm__ volatile(
/* r0 = n / 16 (iteration count), r1 = 0 (byte offset) */
"srlg %%r0,%0,4 \n\t"
"xgr %%r1,%%r1 \n\t"
"0: \n\t"
/* prefetch for store, 1024 bytes ahead in both buffers */
"pfd 2, 1024(%%r1,%1) \n\t"
"pfd 2, 1024(%%r1,%2) \n\t"
"vl %%v16, 0(%%r1,%1) \n\t"
"vl %%v17, 16(%%r1,%1) \n\t"
"vl %%v18, 32(%%r1,%1) \n\t"
"vl %%v19, 48(%%r1,%1) \n\t"
"vl %%v20, 64(%%r1,%1) \n\t"
"vl %%v21, 80(%%r1,%1) \n\t"
"vl %%v22, 96(%%r1,%1) \n\t"
"vl %%v23, 112(%%r1,%1) \n\t"
"vl %%v24, 128(%%r1,%1) \n\t"
"vl %%v25, 144(%%r1,%1) \n\t"
"vl %%v26, 160(%%r1,%1) \n\t"
"vl %%v27, 176(%%r1,%1) \n\t"
"vl %%v28, 192(%%r1,%1) \n\t"
"vl %%v29, 208(%%r1,%1) \n\t"
"vl %%v30, 224(%%r1,%1) \n\t"
"vl %%v31, 240(%%r1,%1) \n\t"

"vl %%v0, 0(%%r1,%2) \n\t"
"vl %%v1, 16(%%r1,%2) \n\t"
"vl %%v2, 32(%%r1,%2) \n\t"
"vl %%v3, 48(%%r1,%2) \n\t"
"vl %%v4, 64(%%r1,%2) \n\t"
"vl %%v5, 80(%%r1,%2) \n\t"
"vl %%v6, 96(%%r1,%2) \n\t"
"vl %%v7, 112(%%r1,%2) \n\t"
"vst %%v0, 0(%%r1,%1) \n\t"
"vst %%v1, 16(%%r1,%1) \n\t"
"vst %%v2, 32(%%r1,%1) \n\t"
"vst %%v3, 48(%%r1,%1) \n\t"
"vst %%v4, 64(%%r1,%1) \n\t"
"vst %%v5, 80(%%r1,%1) \n\t"
"vst %%v6, 96(%%r1,%1) \n\t"
"vst %%v7, 112(%%r1,%1) \n\t"

"vl %%v0, 128(%%r1,%2) \n\t"
"vl %%v1, 144(%%r1,%2) \n\t"
"vl %%v2, 160(%%r1,%2) \n\t"
"vl %%v3, 176(%%r1,%2) \n\t"
"vl %%v4, 192(%%r1,%2) \n\t"
"vl %%v5, 208(%%r1,%2) \n\t"
"vl %%v6, 224(%%r1,%2) \n\t"
"vl %%v7, 240(%%r1,%2) \n\t"
"vst %%v0, 128(%%r1,%1) \n\t"
"vst %%v1, 144(%%r1,%1) \n\t"
"vst %%v2, 160(%%r1,%1) \n\t"
"vst %%v3, 176(%%r1,%1) \n\t"
"vst %%v4, 192(%%r1,%1) \n\t"
"vst %%v5, 208(%%r1,%1) \n\t"
"vst %%v6, 224(%%r1,%1) \n\t"
"vst %%v7, 240(%%r1,%1) \n\t"

"vst %%v16, 0(%%r1,%2) \n\t"
"vst %%v17, 16(%%r1,%2) \n\t"
"vst %%v18, 32(%%r1,%2) \n\t"
"vst %%v19, 48(%%r1,%2) \n\t"
"vst %%v20, 64(%%r1,%2) \n\t"
"vst %%v21, 80(%%r1,%2) \n\t"
"vst %%v22, 96(%%r1,%2) \n\t"
"vst %%v23, 112(%%r1,%2) \n\t"
"vst %%v24, 128(%%r1,%2) \n\t"
"vst %%v25, 144(%%r1,%2) \n\t"
"vst %%v26, 160(%%r1,%2) \n\t"
"vst %%v27, 176(%%r1,%2) \n\t"
"vst %%v28, 192(%%r1,%2) \n\t"
"vst %%v29, 208(%%r1,%2) \n\t"
"vst %%v30, 224(%%r1,%2) \n\t"
"vst %%v31, 240(%%r1,%2) \n\t"
/* advance by 256 bytes; decrement r0 and repeat while it is non-zero */
"agfi %%r1,256 \n\t"
"brctg %%r0,0b "
:
/* %0 = n, %1 = x, %2 = y; there are no output operands, the writes are covered by the "memory" clobber */
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y)
:"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}

#else

/*
 * zswap_kernel_16 (variant used when Z13_SWAP_A is not defined): exchange the
 * contents of the contiguous buffers x and y, 256 bytes from each buffer per
 * loop iteration (16 elements of 16 bytes, matching the name; the memory
 * operands type the buffers as double[2*n]).
 *
 * n - element count. The iteration count is n >> 4 and the loop is closed by
 *     brctg (decrement, then branch while non-zero), so a count of zero is
 *     not handled.
 * x, y - the two buffers; each is read and then overwritten with the other's
 *        data.
 *
 * Per iteration: 256 bytes of x are held in v16..v31, y is copied into x
 * through v0..v7 in two 128-byte halves, and finally the saved x data is
 * stored into y.
 */
static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__ volatile(
"pfd 2, 0(%[ptr_x]) \n\t"
"pfd 2, 0(%[ptr_y]) \n\t"
/* n_tmp = n / 16 (iteration count, updated in place), r1 = 0 (byte offset) */
"srlg %[n_tmp],%[n_tmp],4 \n\t"
"xgr %%r1,%%r1 \n\t"
".align 16 \n\t"
"1: \n\t"
/* prefetch for store, 256 bytes ahead in both buffers */
"pfd 2, 256(%%r1,%[ptr_x]) \n\t"
"pfd 2, 256(%%r1,%[ptr_y]) \n\t"
/* save the current 256 bytes of x in v16..v31 */
"vl %%v16, 0(%%r1,%[ptr_x]) \n\t"
"vl %%v17, 16(%%r1,%[ptr_x]) \n\t"
"vl %%v18, 32(%%r1,%[ptr_x]) \n\t"
"vl %%v19, 48(%%r1,%[ptr_x]) \n\t"
"vl %%v20, 64(%%r1,%[ptr_x]) \n\t"
"vl %%v21, 80(%%r1,%[ptr_x]) \n\t"
"vl %%v22, 96(%%r1,%[ptr_x]) \n\t"
"vl %%v23, 112(%%r1,%[ptr_x]) \n\t"
"vl %%v24, 128(%%r1,%[ptr_x]) \n\t"
"vl %%v25, 144(%%r1,%[ptr_x]) \n\t"
"vl %%v26, 160(%%r1,%[ptr_x]) \n\t"
"vl %%v27, 176(%%r1,%[ptr_x]) \n\t"
"vl %%v28, 192(%%r1,%[ptr_x]) \n\t"
"vl %%v29, 208(%%r1,%[ptr_x]) \n\t"
"vl %%v30, 224(%%r1,%[ptr_x]) \n\t"
"vl %%v31, 240(%%r1,%[ptr_x]) \n\t"


/* first 128 bytes: y -> x */
"vl %%v0, 0(%%r1,%[ptr_y]) \n\t"
"vl %%v1, 16(%%r1,%[ptr_y]) \n\t"
"vl %%v2, 32(%%r1,%[ptr_y]) \n\t"
"vl %%v3, 48(%%r1,%[ptr_y]) \n\t"
"vl %%v4, 64(%%r1,%[ptr_y]) \n\t"
"vl %%v5, 80(%%r1,%[ptr_y]) \n\t"
"vl %%v6, 96(%%r1,%[ptr_y]) \n\t"
"vl %%v7, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v0, 0(%%r1,%[ptr_x]) \n\t"
"vst %%v1, 16(%%r1,%[ptr_x]) \n\t"
"vst %%v2, 32(%%r1,%[ptr_x]) \n\t"
"vst %%v3, 48(%%r1,%[ptr_x]) \n\t"
"vst %%v4, 64(%%r1,%[ptr_x]) \n\t"
"vst %%v5, 80(%%r1,%[ptr_x]) \n\t"
"vst %%v6, 96(%%r1,%[ptr_x]) \n\t"
"vst %%v7, 112(%%r1,%[ptr_x]) \n\t"

/* second 128 bytes: y -> x */
"vl %%v0, 128(%%r1,%[ptr_y]) \n\t"
"vl %%v1, 144(%%r1,%[ptr_y]) \n\t"
"vl %%v2, 160(%%r1,%[ptr_y]) \n\t"
"vl %%v3, 176(%%r1,%[ptr_y]) \n\t"
"vl %%v4, 192(%%r1,%[ptr_y]) \n\t"
"vl %%v5, 208(%%r1,%[ptr_y]) \n\t"
"vl %%v6, 224(%%r1,%[ptr_y]) \n\t"
"vl %%v7, 240(%%r1,%[ptr_y]) \n\t"
"vst %%v0, 128(%%r1,%[ptr_x]) \n\t"
"vst %%v1, 144(%%r1,%[ptr_x]) \n\t"
"vst %%v2, 160(%%r1,%[ptr_x]) \n\t"
"vst %%v3, 176(%%r1,%[ptr_x]) \n\t"
"vst %%v4, 192(%%r1,%[ptr_x]) \n\t"
"vst %%v5, 208(%%r1,%[ptr_x]) \n\t"
"vst %%v6, 224(%%r1,%[ptr_x]) \n\t"
"vst %%v7, 240(%%r1,%[ptr_x]) \n\t"

/* saved x data -> y */
"vst %%v16, 0(%%r1,%[ptr_y]) \n\t"
"vst %%v17, 16(%%r1,%[ptr_y]) \n\t"
"vst %%v18, 32(%%r1,%[ptr_y]) \n\t"
"vst %%v19, 48(%%r1,%[ptr_y]) \n\t"
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t"
"vst %%v21, 80(%%r1,%[ptr_y]) \n\t"
"vst %%v22, 96(%%r1,%[ptr_y]) \n\t"
"vst %%v23, 112(%%r1,%[ptr_y]) \n\t"
"vst %%v24, 128(%%r1,%[ptr_y]) \n\t"
"vst %%v25, 144(%%r1,%[ptr_y]) \n\t"
"vst %%v26, 160(%%r1,%[ptr_y]) \n\t"
"vst %%v27, 176(%%r1,%[ptr_y]) \n\t"
"vst %%v28, 192(%%r1,%[ptr_y]) \n\t"
"vst %%v29, 208(%%r1,%[ptr_y]) \n\t"
"vst %%v30, 224(%%r1,%[ptr_y]) \n\t"
"vst %%v31, 240(%%r1,%[ptr_y]) \n\t"
/* advance by 256 bytes; decrement n_tmp and repeat while it is non-zero */
"la %%r1,256(%%r1) \n\t"
"brctg %[n_tmp],1b"
/* both buffers are declared as read-write memory operands; n is consumed as the loop counter */
: [mem_x] "+m" (*(double (*)[2*n])x),
[mem_y] "+m" (*(double (*)[2*n])y),
[n_tmp] "+&r"(n)
: [ptr_x] "a"(x), [ptr_y] "a"(y)
: "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16",
"v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
return;

}

#endif





int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0;


+ 437
- 0
ztest/Makefile View File

@@ -0,0 +1,437 @@
TOPDIR = ..
include $(TOPDIR)/Makefile.system

goto :: sdot.goto ddot.goto cdot.goto zdot.goto dsdot.goto sswap.goto dswap.goto cswap.goto zswap.goto isamax.goto idamax.goto icamax.goto izamax.goto samax.goto damax.goto ismax.goto idmax.goto smax.goto dmax.goto isamin.goto idamin.goto icamin.goto izamin.goto samin.goto damin.goto camin.goto zamin.goto ismin.goto idmin.goto smin.goto dmin.goto sgemv.goto dgemv.goto cgemv.goto zgemv.goto sscal.goto dscal.goto cscal.goto zscal.goto saxpy.goto daxpy.goto caxpy.goto zaxpy.goto srot.goto drot.goto crot.goto zrot.goto sasum.goto dasum.goto casum.goto zasum.goto scopy.goto dcopy.goto ccopy.goto zcopy.goto

##################################### Sdot ####################################################
sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Ddot ####################################################
ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Cdot ####################################################
cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Zdot ####################################################
zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Dsdot ####################################################
dsdot.goto : dsdot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## ISAMAX ##############################################
isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## IDAMAX ##############################################
idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## ICAMAX ##############################################
icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## IZAMAX ##############################################
izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## SAMAX ##############################################
samax.goto : samax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## DAMAX ##############################################
damax.goto : damax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## ISMAX ##############################################
ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## IDMAX ##############################################
idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## SMAX ##############################################
smax.goto : smax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## DMAX ##############################################
dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## ISAMIN ##############################################
isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## IDAMIN ##############################################
idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## ICAMIN ##############################################
icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## IZAMIN ##############################################
izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## SAMIN ##############################################
samin.goto : samin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## DAMIN ##############################################
damin.goto : damin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## CAMIN ##############################################
camin.goto : camin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## ZAMIN ##############################################
zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## ISMIN ##############################################
ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## IDMIN ##############################################
idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## SMIN ##############################################
smin.goto : smin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

############################################## DMIN ##############################################
dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Sgemv ####################################################
sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Dgemv ####################################################
dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Cgemv ####################################################

cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Zgemv ####################################################

zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Sscal ####################################################
sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Dscal ####################################################
dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Cscal ####################################################

cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Zscal ####################################################

zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Saxpy ####################################################
saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Daxpy ####################################################
daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Caxpy ####################################################

caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Zaxpy ####################################################

zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Srot ####################################################
srot.goto : srot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Drot ####################################################
drot.goto : drot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Crot ####################################################
crot.goto : crot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Zrot ####################################################
zrot.goto : zrot.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Sswap ####################################################
sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Dswap ####################################################
dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Cswap ####################################################

cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Zswap ####################################################

zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

# The saxpy/daxpy/caxpy/zaxpy .goto link rules are defined once, in the
# Saxpy..Zaxpy sections earlier in this file; repeating them here only made
# make warn about overriding the recipe for each target.

##################################### Sasum ####################################################
sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Dasum ####################################################
dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Casum ####################################################

casum.goto : casum.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Zasum ####################################################

zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Scopy ####################################################
scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Dcopy ####################################################
dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Ccopy ####################################################

ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

##################################### Zcopy ####################################################

zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

###################################################################################################

sdot.$(SUFFIX) : dot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

ddot.$(SUFFIX) : dot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

cdot.$(SUFFIX) : dot.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zdot.$(SUFFIX) : dot.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

dsdot.$(SUFFIX) : dsdot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

isamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

idamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

icamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

izamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

samax.$(SUFFIX) : amax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

damax.$(SUFFIX) : amax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

ismax.$(SUFFIX) : imax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

idmax.$(SUFFIX) : imax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

smax.$(SUFFIX) : max.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

dmax.$(SUFFIX) : max.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

isamin.$(SUFFIX) : iamin.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

idamin.$(SUFFIX) : iamin.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

icamin.$(SUFFIX) : iamin.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

izamin.$(SUFFIX) : iamin.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

samin.$(SUFFIX) : amin.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

damin.$(SUFFIX) : amin.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

camin.$(SUFFIX) : amin.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zamin.$(SUFFIX) : amin.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

ismin.$(SUFFIX) : imin.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

idmin.$(SUFFIX) : imin.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

smin.$(SUFFIX) : min.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

dmin.$(SUFFIX) : min.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

sgemv.$(SUFFIX) : gemv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

dgemv.$(SUFFIX) : gemv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

cgemv.$(SUFFIX) : gemv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zgemv.$(SUFFIX) : gemv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

sscal.$(SUFFIX) : scal.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

dscal.$(SUFFIX) : scal.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

cscal.$(SUFFIX) : scal.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zscal.$(SUFFIX) : scal.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

saxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

daxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

caxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zaxpy.$(SUFFIX) : axpy.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

srot.$(SUFFIX) : rot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

drot.$(SUFFIX) : rot.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

crot.$(SUFFIX) : rot.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zrot.$(SUFFIX) : rot.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

sswap.$(SUFFIX) : swap.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

dswap.$(SUFFIX) : swap.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

cswap.$(SUFFIX) : swap.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zswap.$(SUFFIX) : swap.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

# The saxpy/daxpy/caxpy/zaxpy object rules (built from axpy.c) are defined
# once, earlier in this file; repeating them here only made make warn about
# overriding the recipe for each target.

sasum.$(SUFFIX) : asum.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

dasum.$(SUFFIX) : asum.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

casum.$(SUFFIX) : asum.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zasum.$(SUFFIX) : asum.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

scopy.$(SUFFIX) : copy.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^

dcopy.$(SUFFIX) : copy.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^

ccopy.$(SUFFIX) : copy.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^

zcopy.$(SUFFIX) : copy.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^

clean ::
@rm -f *.goto


+ 235
- 0
ztest/amax.c View File

@@ -0,0 +1,235 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Compare two doubles with an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually obtained
 * tol  - largest absolute difference that still counts as a match
 *
 * Returns 1 when the values match, 0 otherwise.  Identical values
 * (including equal infinities) always match; a NaN on either side never
 * matches, so a NaN result can no longer slip through as a pass.
 */
int assert_dbl_near(double exp, double real, double tol) {
  double diff;

  if (exp == real) {
    return 1;
  }

  diff = exp - real;
  /* avoid using fabs and linking with a math lib */
  if (diff < 0) {
    diff = -diff;
  }

  /* phrased as "not within tol" so that a NaN difference is a mismatch */
  if (!(diff <= tol)) {
    return 0;
  }
  return 1;
}

#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * Plain C reference: largest ABS() value among n elements of x taken
 * inc_x apart, starting at x[0].
 * Returns 0.0 when n or inc_x is not positive.
 */
FLOAT amax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
  FLOAT best = 0.0;
  FLOAT mag;
  BLASLONG k;
  BLASLONG pos;

  if (n <= 0 || inc_x <= 0) return(best);

  /* the first element seeds the running maximum */
  best = ABS(x[0]);

  for (k = 1, pos = inc_x; k < n; k++, pos += inc_x)
  {
    mag = ABS(x[pos]);
    if (mag > best) best = mag;
  }

  return(best);
}

#undef AMAX
#ifdef DOUBLE
#define AMAX BLASFUNC(damax)
#else
#define AMAX BLASFUNC(samax)
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

int gettimeofday(struct timeval *tv, void *tz){

FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;

if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);

tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;

/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}

return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/*
 * malloc() stand-in that obtains memory from a SysV shared-memory segment
 * requested with SHM_HUGETLB.  The enclosing #if ends in "&& 0", so this
 * is currently compiled out.
 *
 * The requested size is padded by HUGE_PAGESIZE and masked down to a
 * multiple of it (assumes HUGE_PAGESIZE is a power of two).
 * On any failure a message is printed and the process exits with status 1.
 */
static void *huge_malloc(BLASLONG size){

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  void *mapped;
  int segment;

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* flag the segment for removal (IPC_RMID) now that it is attached */
  shmctl(segment, IPC_RMID, 0);

  return mapped;
}

#define malloc huge_malloc

#endif

/*
 * Test / timing driver for AMAX.
 *
 * Usage: prog [from [to [step]]]
 *   from, to, step - vector lengths to run (defaults 1, 200, 1)
 * Environment:
 *   OPENBLAS_LOOPS - repetitions per length (default 1)
 *   OPENBLAS_INCX  - stride passed to the routine (default 1)
 *
 * For every length the vector is filled with random values in
 * [-0.5, 0.5], AMAX and the C reference amax_c() are timed, and their
 * results are compared.  The pass/fail flag is cumulative: once one
 * comparison fails, every later line reports the failure too.
 * The exit status is always 0.
 */
int main(int argc, char *argv[]){

  FLOAT *x;
  FLOAT result, result_c;
  blasint m, i;
  blasint inc_x=1;
  int loops = 1;
  int l;
  char *p;

  int from = 1;
  int to = 200;
  int step = 1;

  struct timeval start, stop;
  double time1,timeg,timeg_c;

  int test = 1;

  /* compare at the precision the routine under test was built for */
#if defined(DOUBLE)
  const double eps = DOUBLE_EPS;
#else
  const double eps = SINGLE_EPS;
#endif

  argc--;argv++;

  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

  /* a non-positive step would never advance m; a non-positive loop count
     would divide the accumulated timings by zero */
  if (step <= 0) step = 1;
  if (loops <= 0) loops = 1;

  /* blasint may be wider than int, so cast it for %d */
  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time CTime Test\n");

  for(m = from; m <= to; m += step)
  {

    timeg=0;
    timeg_c=0;

    fprintf(stderr, " %6d :", (int)m);


    for (l=0; l<loops; l++)
    {

      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      /* routine under test */
      gettimeofday( &start, (struct timezone *)0);
      result = AMAX (&m, x, &inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;

      /* C reference */
      gettimeofday( &start, (struct timezone *)0);
      result_c = amax_c(m, x, inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg_c += time1;

      test &= assert_dbl_near(result, result_c, eps);

    }

    timeg /= loops;
    timeg_c /= loops;
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");

  }

  return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 235
- 0
ztest/amin.c View File

@@ -0,0 +1,235 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Compare two doubles with an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually obtained
 * tol  - largest absolute difference that still counts as a match
 *
 * Returns 1 when the values match, 0 otherwise.  Identical values
 * (including equal infinities) always match; a NaN on either side never
 * matches, so a NaN result can no longer slip through as a pass.
 */
int assert_dbl_near(double exp, double real, double tol) {
  double diff;

  if (exp == real) {
    return 1;
  }

  diff = exp - real;
  /* avoid using fabs and linking with a math lib */
  if (diff < 0) {
    diff = -diff;
  }

  /* phrased as "not within tol" so that a NaN difference is a mismatch */
  if (!(diff <= tol)) {
    return 0;
  }
  return 1;
}

#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * Plain C reference: smallest ABS() value among n elements of x taken
 * inc_x apart, starting at x[0].
 * Returns 0.0 when n or inc_x is not positive.
 */
FLOAT amin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
  FLOAT best = 0.0;
  FLOAT mag;
  BLASLONG k;
  BLASLONG pos;

  if (n <= 0 || inc_x <= 0) return(best);

  /* the first element seeds the running minimum */
  best = ABS(x[0]);

  for (k = 1, pos = inc_x; k < n; k++, pos += inc_x)
  {
    mag = ABS(x[pos]);
    if (mag < best) best = mag;
  }

  return(best);
}

#undef AMIN
#ifdef DOUBLE
#define AMIN BLASFUNC(damin)
#else
#define AMIN BLASFUNC(samin)
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

int gettimeofday(struct timeval *tv, void *tz){

FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;

if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);

tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;

/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}

return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/*
 * malloc() stand-in that obtains memory from a SysV shared-memory segment
 * requested with SHM_HUGETLB.  The enclosing #if ends in "&& 0", so this
 * is currently compiled out.
 *
 * The requested size is padded by HUGE_PAGESIZE and masked down to a
 * multiple of it (assumes HUGE_PAGESIZE is a power of two).
 * On any failure a message is printed and the process exits with status 1.
 */
static void *huge_malloc(BLASLONG size){

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  void *mapped;
  int segment;

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* flag the segment for removal (IPC_RMID) now that it is attached */
  shmctl(segment, IPC_RMID, 0);

  return mapped;
}

#define malloc huge_malloc

#endif

/*
 * Test / timing driver for AMIN.
 *
 * Usage: prog [from [to [step]]]
 *   from, to, step - vector lengths to run (defaults 1, 200, 1)
 * Environment:
 *   OPENBLAS_LOOPS - repetitions per length (default 1)
 *   OPENBLAS_INCX  - stride passed to the routine (default 1)
 *
 * For every length the vector is filled with random values in
 * [-0.5, 0.5], AMIN and the C reference amin_c() are timed, and their
 * results are compared.  The pass/fail flag is cumulative: once one
 * comparison fails, every later line reports the failure too.
 * The exit status is always 0.
 */
int main(int argc, char *argv[]){

  FLOAT *x;
  FLOAT result, result_c;
  blasint m, i;
  blasint inc_x=1;
  int loops = 1;
  int l;
  char *p;

  int from = 1;
  int to = 200;
  int step = 1;

  struct timeval start, stop;
  double time1,timeg,timeg_c;

  int test = 1;

  /* compare at the precision the routine under test was built for */
#if defined(DOUBLE)
  const double eps = DOUBLE_EPS;
#else
  const double eps = SINGLE_EPS;
#endif

  argc--;argv++;

  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

  /* a non-positive step would never advance m; a non-positive loop count
     would divide the accumulated timings by zero */
  if (step <= 0) step = 1;
  if (loops <= 0) loops = 1;

  /* blasint may be wider than int, so cast it for %d */
  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time CTime Test\n");

  for(m = from; m <= to; m += step)
  {

    timeg=0;
    timeg_c=0;

    fprintf(stderr, " %6d :", (int)m);


    for (l=0; l<loops; l++)
    {

      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      /* routine under test */
      gettimeofday( &start, (struct timezone *)0);
      result = AMIN (&m, x, &inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;

      /* C reference */
      gettimeofday( &start, (struct timezone *)0);
      result_c = amin_c(m, x, inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg_c += time1;

      test &= assert_dbl_near(result, result_c, eps);

    }

    timeg /= loops;
    timeg_c /= loops;
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");

  }

  return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 263
- 0
ztest/asum.c View File

@@ -0,0 +1,263 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Compare two doubles with an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually obtained
 * tol  - largest absolute difference that still counts as a match
 *
 * Returns 1 when the values match, 0 otherwise.  Identical values
 * (including equal infinities) always match; a NaN on either side never
 * matches, so a NaN result can no longer slip through as a pass.
 */
int assert_dbl_near(double exp, double real, double tol) {
  double diff;

  if (exp == real) {
    return 1;
  }

  diff = exp - real;
  /* avoid using fabs and linking with a math lib */
  if (diff < 0) {
    diff = -diff;
  }

  /* phrased as "not within tol" so that a NaN difference is a mismatch */
  if (!(diff <= tol)) {
    return 0;
  }
  return 1;
}

#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#ifdef COMPLEX
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])
/*
 * Plain C reference for the complex case: adds CABS1() of n interleaved
 * (x[i], x[i+1]) pairs, stepping 2 * inc_x FLOATs between pairs.
 * Returns 0.0 when n or inc_x is not positive.
 */
FLOAT zasum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
  FLOAT total = 0.0;
  BLASLONG stride;
  BLASLONG limit;
  BLASLONG pos;

  if (n <= 0 || inc_x <= 0) return(total);

  stride = 2 * inc_x;       /* two FLOATs per element */
  limit = n * stride;

  for (pos = 0; pos < limit; pos += stride)
  {
    total += CABS1(x,pos);
  }

  return(total);
}
#else
/*
 * Plain C reference for the real case: adds ABS() of n elements of x
 * taken inc_x apart, starting at x[0].
 * Returns 0.0 when n or inc_x is not positive.
 */
FLOAT asum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
  FLOAT total = 0.0;
  BLASLONG limit;
  BLASLONG pos;

  if (n <= 0 || inc_x <= 0) return(total);

  limit = n * inc_x;

  for (pos = 0; pos < limit; pos += inc_x)
  {
    total += ABS(x[pos]);
  }

  return(total);
}
#endif

#undef ASUM
#ifdef COMPLEX
#ifdef DOUBLE
#define ASUM BLASFUNC(dzasum)
#else
#define ASUM BLASFUNC(scasum)
#endif
#else
#ifdef DOUBLE
#define ASUM BLASFUNC(dasum)
#else
#define ASUM BLASFUNC(sasum)
#endif
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

int gettimeofday(struct timeval *tv, void *tz){

FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;

if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);

tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;

/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}

return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/* Fallback for headers that do not provide SHM_HUGETLB. */
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * Allocate memory from a private SysV shared-memory segment requested with
 * SHM_HUGETLB.  The requested length is size plus one HUGE_PAGESIZE, masked
 * with ~(HUGE_PAGESIZE - 1).  The segment is marked for removal right after
 * it has been attached.  On failure a message is printed and the process
 * exits with status 1.
 *
 * NOTE: the enclosing #if ends in "&& 0", so this code is compiled out.
 */
static void *huge_malloc(BLASLONG size){
    int segment;
    void *mapping;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT | 0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapping = shmat(segment, NULL, SHM_RND);
    if (-1 == (BLASLONG)mapping) {
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mapping;
}

/* Route every malloc() call below through huge_malloc(). */
#define malloc huge_malloc

#endif

/*
 * Driver: for every size m in [from, to] (advancing by step) run the library
 * ASUM and the plain-C reference on the same random vector, time both and
 * compare the two sums.
 *
 * Command line : [from [to [step]]]         (defaults 1, 200, 1)
 * Environment  : OPENBLAS_LOOPS  repetitions per size (default 1)
 *                OPENBLAS_INCX   stride of x          (default 1)
 *
 * One line per size goes to stderr: rate, mean library time, mean reference
 * time and the verdict.  The verdict is cumulative: once one comparison has
 * failed, every later size is reported as failed too.
 * Always returns 0.
 */
int main(int argc, char *argv[]){

  FLOAT *x;
  FLOAT result, result_c;
  blasint m, i;
  blasint inc_x=1;
  int loops = 1;
  int l;
  char *p;

  int from = 1;
  int to = 200;
  int step = 1;

  struct timeval start, stop;
  double time1,timeg,timeg_c;

  int test = 1;

  argc--;argv++;

  if (argc > 0) { from = atol(*argv); argc--; argv++;}
  if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
  if (argc > 0) { step = atol(*argv); argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

  /* A non-positive step never moves m past `to`, and a non-positive loop
     count would average over zero runs and report PASS without testing
     anything; fall back to the defaults in both cases. */
  if (step < 1) step = 1;
  if (loops < 1) loops = 1;

  /* blasint is not necessarily int, so convert before handing it to %d. */
  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time CTime Test\n");

  for(m = from; m <= to; m += step)
  {

    timeg=0;
    timeg_c=0;

    fprintf(stderr, " %6d :", (int)m);

    for (l=0; l<loops; l++)
    {

      /* Fresh input in [-0.5, 0.5] for every repetition. */
      for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
      }

      /* Library routine. */
      gettimeofday( &start, (struct timezone *)0);
      result = ASUM (&m, x, &inc_x);
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;

      /* Plain-C reference. */
      gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
      result_c = zasum_c(m, x, inc_x);
#else
      result_c = asum_c(m, x, inc_x);
#endif
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg_c += time1;

      test &= assert_dbl_near(result, result_c, SINGLE_EPS);
    }

    timeg /= loops;
    timeg_c /= loops;

#ifdef COMPLEX
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 4. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#else
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#endif

  }

  return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 303
- 0
ztest/axpy.c View File

@@ -0,0 +1,303 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

/* Absolute tolerances for comparing results.  main() below passes SINGLE_EPS
   to assert_dbl_near() regardless of precision; DOUBLE_EPS is defined but not
   referenced anywhere in this file. */
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Report whether two doubles agree to within an absolute tolerance.
 *
 *   exp  - expected value
 *   real - value under test
 *   tol  - largest absolute difference that still counts as a match
 *
 * Returns 1 when |exp - real| <= tol and 0 otherwise.  A NaN on either side
 * never matches: the check accepts only values proven to be within tol,
 * because every ordered comparison involving NaN is false.  Equal values,
 * including equal infinities, differ by exactly zero.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double absdiff = 0.0;

    /* inf - inf would be NaN, so only subtract when the values differ. */
    if (exp != real) {
        /* avoid using fabs and linking with a math lib */
        absdiff = exp - real;
        if (absdiff < 0) {
            absdiff = -absdiff;
        }
    }

    return absdiff <= tol ? 1 : 0;
}

#ifdef COMPLEX
/*
 * Plain-C reference for the complex AXPY update y += (da_r + i*da_i) * x.
 *
 * Processes n entries; each entry is two consecutive FLOAT slots (real part
 * first) and successive entries are 2 * inc_x / 2 * inc_y slots apart.  With
 * CONJ defined the alternative sign pattern of the #else branch is applied.
 * Nothing is done when n is negative or when both parts of the scalar are
 * zero.  dummy0, dummy1, dummy and dummy2 are unused.  Always returns 0.
 */
int zaxpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG xpos = 0;
    BLASLONG ypos = 0;
    BLASLONG xstep;
    BLASLONG ystep;

    if (n < 0 || (da_r == 0.0 && da_i == 0.0)) {
        return 0;
    }

    xstep = 2 * inc_x;
    ystep = 2 * inc_y;

    for (k = 0; k < n; k++) {
#if !defined(CONJ)
        y[ypos] += ( da_r * x[xpos] - da_i * x[xpos+1] ) ;
        y[ypos+1] += ( da_r * x[xpos+1] + da_i * x[xpos] ) ;
#else
        y[ypos] += ( da_r * x[xpos] + da_i * x[xpos+1] ) ;
        y[ypos+1] -= ( da_r * x[xpos+1] - da_i * x[xpos] ) ;
#endif
        xpos += xstep;
        ypos += ystep;
    }

    return 0;
}
#else
/*
 * Plain-C reference for the real AXPY update y += da * x.
 *
 * Processes n entries, stepping x by inc_x and y by inc_y.  Nothing is done
 * when n is negative or da is zero.  dummy0, dummy1, dummy and dummy2 are
 * unused.  Always returns 0.
 */
int axpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG xpos = 0;
    BLASLONG ypos = 0;

    if (n < 0 || da == 0.0) {
        return 0;
    }

    for (k = 0; k < n; k++) {
        y[ypos] += da * x[xpos] ;
        xpos += inc_x;
        ypos += inc_y;
    }

    return 0;
}
#endif

/* AXPY names the library routine that main() times and checks.  It is picked
   from the COMPLEX and DOUBLE build flags: zaxpy / caxpy when COMPLEX is
   defined, daxpy / saxpy otherwise.  BLASFUNC() is not defined in this file
   (presumably it comes from common.h). */
#undef AXPY
#ifdef COMPLEX
#ifdef DOUBLE
#define AXPY BLASFUNC(zaxpy)
#else
#define AXPY BLASFUNC(caxpy)
#endif
#else
#ifdef DOUBLE
#define AXPY BLASFUNC(daxpy)
#else
#define AXPY BLASFUNC(saxpy)
#endif
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

int gettimeofday(struct timeval *tv, void *tz){

FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;

if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);

tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;

/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}

return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/* Fallback for headers that do not provide SHM_HUGETLB. */
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * Allocate memory from a private SysV shared-memory segment requested with
 * SHM_HUGETLB.  The requested length is size plus one HUGE_PAGESIZE, masked
 * with ~(HUGE_PAGESIZE - 1).  The segment is marked for removal right after
 * it has been attached.  On failure a message is printed and the process
 * exits with status 1.
 *
 * NOTE: the enclosing #if ends in "&& 0", so this code is compiled out.
 */
static void *huge_malloc(BLASLONG size){
    int segment;
    void *mapping;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT | 0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapping = shmat(segment, NULL, SHM_RND);
    if (-1 == (BLASLONG)mapping) {
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mapping;
}

/* Route every malloc() call below through huge_malloc(). */
#define malloc huge_malloc

#endif

int main(int argc, char *argv[]){

FLOAT *x, *y, *y_c;;
FLOAT alpha[2] = { 2.0, 2.0 };
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;

int from = 1;
int to = 200;
int step = 1;

struct timeval start, stop;
double time1,timeg,timeg_c;

argc--;argv++;

blasint iy;
int test = 1;

if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}

if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);

if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}

if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}

if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}

#ifdef linux
srandom(getpid());
#endif

fprintf(stderr, " SIZE Flops Time CTime Test\n");

for(m = from; m <= to; m += step)
{

timeg=0;
timeg_c=0;

fprintf(stderr, " %6d :", (int)m);


for (l=0; l<loops; l++)
{

for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}

for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
y_c[i] = y[i];
}
gettimeofday( &start, (struct timezone *)0);
AXPY (&m, alpha, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;

gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
zaxpy_c(m, 0, 0, alpha[0], alpha[1], x, inc_x, y_c, inc_y, NULL, 0);
#else
axpy_c(m, 0, 0, *alpha, x, inc_x, y_c, inc_y, NULL, 0);
#endif
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg_c += time1;

iy = 0;
#ifdef COMPLEX
for (i = 0; i < m * 2; i++)
#else
for (i = 0; i < m; i++)
#endif
{
test &= assert_dbl_near(y[iy], y_c[iy], SINGLE_EPS);
iy += inc_y;
}
}

timeg /= loops;
timeg_c /= loops;

#ifdef COMPLEX
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#else
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#endif

}

return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 291
- 0
ztest/copy.c View File

@@ -0,0 +1,291 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

/* Absolute tolerances for comparing results.  main() below passes SINGLE_EPS
   to assert_dbl_near() regardless of precision; DOUBLE_EPS is defined but not
   referenced anywhere in this file. */
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Report whether two doubles agree to within an absolute tolerance.
 *
 *   exp  - expected value
 *   real - value under test
 *   tol  - largest absolute difference that still counts as a match
 *
 * Returns 1 when |exp - real| <= tol and 0 otherwise.  A NaN on either side
 * never matches: the check accepts only values proven to be within tol,
 * because every ordered comparison involving NaN is false.  Equal values,
 * including equal infinities, differ by exactly zero.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double absdiff = 0.0;

    /* inf - inf would be NaN, so only subtract when the values differ. */
    if (exp != real) {
        /* avoid using fabs and linking with a math lib */
        absdiff = exp - real;
        if (absdiff < 0) {
            absdiff = -absdiff;
        }
    }

    return absdiff <= tol ? 1 : 0;
}

#ifdef COMPLEX
/*
 * Plain-C reference for the complex COPY: copies n entries from x to y.
 *
 * Each entry is two consecutive FLOAT slots; successive entries are
 * 2 * inc_x slots apart in x and 2 * inc_y slots apart in y.
 * Nothing is done when n is negative.  Always returns 0.
 */
int zcopy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG k;
    BLASLONG src = 0;
    BLASLONG dst = 0;
    BLASLONG src_step;
    BLASLONG dst_step;

    if (n < 0) {
        return 0;
    }

    src_step = 2 * inc_x;
    dst_step = 2 * inc_y;

    for (k = 0; k < n; k++) {
        y[dst] = x[src] ;
        y[dst+1] = x[src+1] ;
        src += src_step;
        dst += dst_step;
    }

    return 0;
}
#else
/*
 * Plain-C reference for the real COPY: y[k * inc_y] = x[k * inc_x] for
 * k = 0 .. n-1.  Nothing is done when n is negative.  Always returns 0.
 */
int copy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG k;
    BLASLONG src = 0;
    BLASLONG dst = 0;

    if (n < 0) {
        return 0;
    }

    for (k = 0; k < n; k++) {
        y[dst] = x[src] ;
        src += inc_x;
        dst += inc_y;
    }

    return 0;
}
#endif

/* COPY names the library routine that main() times and checks.  It is picked
   from the COMPLEX and DOUBLE build flags: zcopy / ccopy when COMPLEX is
   defined, dcopy / scopy otherwise.  BLASFUNC() is not defined in this file
   (presumably it comes from common.h). */
#undef COPY
#ifdef COMPLEX
#ifdef DOUBLE
#define COPY BLASFUNC(zcopy)
#else
#define COPY BLASFUNC(ccopy)
#endif
#else
#ifdef DOUBLE
#define COPY BLASFUNC(dcopy)
#else
#define COPY BLASFUNC(scopy)
#endif
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

int gettimeofday(struct timeval *tv, void *tz){

FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;

if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);

tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;

/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}

return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/* Fallback for headers that do not provide SHM_HUGETLB. */
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * Allocate memory from a private SysV shared-memory segment requested with
 * SHM_HUGETLB.  The requested length is size plus one HUGE_PAGESIZE, masked
 * with ~(HUGE_PAGESIZE - 1).  The segment is marked for removal right after
 * it has been attached.  On failure a message is printed and the process
 * exits with status 1.
 *
 * NOTE: the enclosing #if ends in "&& 0", so this code is compiled out.
 */
static void *huge_malloc(BLASLONG size){
    int segment;
    void *mapping;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT | 0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapping = shmat(segment, NULL, SHM_RND);
    if (-1 == (BLASLONG)mapping) {
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mapping;
}

/* Route every malloc() call below through huge_malloc(). */
#define malloc huge_malloc

#endif

int main(int argc, char *argv[]){

FLOAT *x, *y, *y_c;
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;

int from = 1;
int to = 200;
int step = 1;

struct timeval start, stop;
double time1,timeg,timeg_c;

blasint iy;
int test = 1;

argc--;argv++;

if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}

if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);

if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}

if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}

if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}

#ifdef linux
srandom(getpid());
#endif

fprintf(stderr, " SIZE Flops Time CTime Test\n");

for(m = from; m <= to; m += step)
{

timeg=0;
timeg_c=0;

fprintf(stderr, " %6d :", (int)m);


for (l=0; l<loops; l++)
{

for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}

for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
y_c[i] = y[i];
}
gettimeofday( &start, (struct timezone *)0);
COPY (&m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;

gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
zcopy_c(m, x, inc_x, y_c, inc_y);
#else
copy_c(m, x, inc_x, y_c, inc_y);
#endif
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg_c += time1;

iy = 0;
#ifdef COMPLEX
for (i = 0; i < m * 2; i++)
#else
for (i = 0; i < m; i++)
#endif
{
test &= assert_dbl_near(y[iy], y_c[iy], SINGLE_EPS);
iy += inc_y;
}

}

timeg /= loops;
timeg_c /= loops;

#ifdef COMPLEX
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 6. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#else
fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#endif

}

return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 296
- 0
ztest/dot.c View File

@@ -0,0 +1,296 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

/* Absolute tolerances for comparing results.  main() below passes SINGLE_EPS
   to assert_dbl_near() regardless of precision; DOUBLE_EPS is defined but not
   referenced anywhere in this file. */
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Report whether two doubles agree to within an absolute tolerance.
 *
 *   exp  - expected value
 *   real - value under test
 *   tol  - largest absolute difference that still counts as a match
 *
 * Returns 1 when |exp - real| <= tol and 0 otherwise.  A NaN on either side
 * never matches: the check accepts only values proven to be within tol,
 * because every ordered comparison involving NaN is false.  Equal values,
 * including equal infinities, differ by exactly zero.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double absdiff = 0.0;

    /* inf - inf would be NaN, so only subtract when the values differ. */
    if (exp != real) {
        /* avoid using fabs and linking with a math lib */
        absdiff = exp - real;
        if (absdiff < 0) {
            absdiff = -absdiff;
        }
    }

    return absdiff <= tol ? 1 : 0;
}

#ifdef COMPLEX
/*
 * Plain-C reference for the complex dot product of x and y.
 *
 * Processes n entries; each entry is two consecutive FLOAT slots (real part
 * first) and successive entries are 2 * inc_x / 2 * inc_y slots apart.  With
 * CONJ defined the alternative sign pattern of the #else branch is used.
 * Returns a zero result when n is less than 1.
 */
OPENBLAS_COMPLEX_FLOAT zdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    OPENBLAS_COMPLEX_FLOAT result;
    FLOAT acc_re = 0.0;
    FLOAT acc_im = 0.0;
    BLASLONG k;
    BLASLONG xpos = 0;
    BLASLONG ypos = 0;
    BLASLONG xstep;
    BLASLONG ystep;

    CREAL(result) = 0.0 ;
    CIMAG(result) = 0.0 ;

    if (n < 1) {
        return result;
    }

    xstep = 2 * inc_x;
    ystep = 2 * inc_y;

    for (k = 0; k < n; k++) {
#if !defined(CONJ)
        acc_re += ( x[xpos] * y[ypos] - x[xpos+1] * y[ypos+1] ) ;
        acc_im += ( x[xpos+1] * y[ypos] + x[xpos] * y[ypos+1] ) ;
#else
        acc_re += ( x[xpos] * y[ypos] + x[xpos+1] * y[ypos+1] ) ;
        acc_im -= ( x[xpos+1] * y[ypos] - x[xpos] * y[ypos+1] ) ;
#endif
        xpos += xstep;
        ypos += ystep;
    }

    CREAL(result) = acc_re;
    CIMAG(result) = acc_im;

    return result;
}
#else
/*
 * Plain-C reference for the real dot product: the sum of
 * y[k * inc_y] * x[k * inc_x] for k = 0 .. n-1, accumulated in FLOAT.
 * Returns 0 when n is negative.
 */
FLOAT dot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    FLOAT acc = 0.0 ;
    BLASLONG k;
    BLASLONG xpos = 0;
    BLASLONG ypos = 0;

    if (n < 0) {
        return acc;
    }

    for (k = 0; k < n; k++) {
        acc += y[ypos] * x[xpos] ;
        xpos += inc_x;
        ypos += inc_y;
    }

    return acc;
}
#endif

/* DOT names the library routine that main() times and checks.  It is picked
   from the COMPLEX and DOUBLE build flags: zdotu / cdotu when COMPLEX is
   defined, ddot / sdot otherwise.  BLASFUNC() is not defined in this file
   (presumably it comes from common.h). */
#undef DOT
#ifdef COMPLEX
#ifdef DOUBLE
#define DOT BLASFUNC(zdotu)
#else
#define DOT BLASFUNC(cdotu)
#endif
#else
#ifdef DOUBLE
#define DOT BLASFUNC(ddot)
#else
#define DOT BLASFUNC(sdot)
#endif
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

int gettimeofday(struct timeval *tv, void *tz){

FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;

if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);

tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;

/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}

return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/* Fallback for headers that do not provide SHM_HUGETLB. */
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * Allocate memory from a private SysV shared-memory segment requested with
 * SHM_HUGETLB.  The requested length is size plus one HUGE_PAGESIZE, masked
 * with ~(HUGE_PAGESIZE - 1).  The segment is marked for removal right after
 * it has been attached.  On failure a message is printed and the process
 * exits with status 1.
 *
 * NOTE: the enclosing #if ends in "&& 0", so this code is compiled out.
 */
static void *huge_malloc(BLASLONG size){
    int segment;
    void *mapping;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT | 0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapping = shmat(segment, NULL, SHM_RND);
    if (-1 == (BLASLONG)mapping) {
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mapping;
}

/* Route every malloc() call below through huge_malloc(). */
#define malloc huge_malloc

#endif

int main(int argc, char *argv[]){

FLOAT *x, *y;
#ifdef COMPLEX
OPENBLAS_COMPLEX_FLOAT result, result_c;
#else
FLOAT result, result_c;
#endif
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;

int from = 1;
int to = 200;
int step = 1;

struct timeval start, stop;
double time1,timeg,timeg_c;

int test = 1;

argc--;argv++;

if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}

if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);

if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}

if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}

#ifdef linux
srandom(getpid());
#endif

fprintf(stderr, " SIZE Flops Time CTime Test\n");

for(m = from; m <= to; m += step)
{

timeg=0;
timeg_c=0;

fprintf(stderr, " %6d :", (int)m);


for (l=0; l<loops; l++)
{

for(i = 0; i < m * COMPSIZE * abs(inc_x); i++) {
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}

for(i = 0; i < m * COMPSIZE * abs(inc_y); i++) {
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}

gettimeofday( &start, (struct timezone *)0);
result = DOT(&m, x, &inc_x, y, &inc_y);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;

gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
result_c = zdot_c(m, x, inc_x, y, inc_y);
#else
result_c = dot_c(m, x, inc_x, y, inc_y);
#endif
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg_c += time1;

#ifdef COMPLEX
test &= assert_dbl_near(CREAL(result), CREAL(result_c), SINGLE_EPS);
test &= assert_dbl_near(CIMAG(result), CIMAG(result_c), SINGLE_EPS);
#else
test &= assert_dbl_near(result, result_c, SINGLE_EPS);
#endif
}

timeg /= loops;
timeg_c /= loops;

fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");

}

return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 229
- 0
ztest/dsdot.c View File

@@ -0,0 +1,229 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

/* Absolute tolerances for comparing results.  main() below passes SINGLE_EPS
   to assert_dbl_near(); DOUBLE_EPS is defined but not referenced anywhere in
   this file. */
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Report whether two doubles agree to within an absolute tolerance.
 *
 *   exp  - expected value
 *   real - value under test
 *   tol  - largest absolute difference that still counts as a match
 *
 * Returns 1 when |exp - real| <= tol and 0 otherwise.  A NaN on either side
 * never matches: the check accepts only values proven to be within tol,
 * because every ordered comparison involving NaN is false.  Equal values,
 * including equal infinities, differ by exactly zero.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double absdiff = 0.0;

    /* inf - inf would be NaN, so only subtract when the values differ. */
    if (exp != real) {
        /* avoid using fabs and linking with a math lib */
        absdiff = exp - real;
        if (absdiff < 0) {
            absdiff = -absdiff;
        }
    }

    return absdiff <= tol ? 1 : 0;
}

/*
 * Plain-C reference for DSDOT: the dot product of two FLOAT vectors with
 * both the products and the running sum held in double precision.
 *
 *   n            - number of entries
 *   x, inc_x     - first vector and its stride
 *   y, inc_y     - second vector and its stride
 *
 * Returns the accumulated sum, or 0 when n is negative.
 */
double dsdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i=0;
    BLASLONG ix=0,iy=0;
    double dot = 0.0 ;

    if ( n < 0 ) return(dot);

    while(i < n)
    {
        /* Promote before multiplying: a FLOAT * FLOAT product would be
           rounded to FLOAT precision first, discarding the extra accuracy
           that DSDOT is defined to keep. */
        dot += (double)y[iy] * (double)x[ix] ;
        ix += inc_x ;
        iy += inc_y ;
        i++ ;
    }

    return(dot);
}

/* DSDOT names the library routine that main() times and checks.  BLASFUNC()
   is not defined in this file (presumably it comes from common.h). */
#undef DSDOT
#define DSDOT BLASFUNC(dsdot)

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

int gettimeofday(struct timeval *tv, void *tz){

FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;

if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);

tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;

/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}

return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/* Fallback for headers that do not provide SHM_HUGETLB. */
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * Allocate memory from a private SysV shared-memory segment requested with
 * SHM_HUGETLB.  The requested length is size plus one HUGE_PAGESIZE, masked
 * with ~(HUGE_PAGESIZE - 1).  The segment is marked for removal right after
 * it has been attached.  On failure a message is printed and the process
 * exits with status 1.
 *
 * NOTE: the enclosing #if ends in "&& 0", so this code is compiled out.
 */
static void *huge_malloc(BLASLONG size){
    int segment;
    void *mapping;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT | 0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapping = shmat(segment, NULL, SHM_RND);
    if (-1 == (BLASLONG)mapping) {
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mapping;
}

/* Route every malloc() call below through huge_malloc(). */
#define malloc huge_malloc

#endif

int main(int argc, char *argv[]){

FLOAT *x, *y;
double result, result_c;
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;

int from = 1;
int to = 200;
int step = 1;

struct timeval start, stop;
double time1,timeg,timeg_c;

int test = 1;

argc--;argv++;

if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}

if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);

if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}

if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}

#ifdef linux
srandom(getpid());
#endif

fprintf(stderr, " SIZE Flops Time CTime Test\n");

for(m = from; m <= to; m += step)
{

timeg=0;
timeg_c=0;

fprintf(stderr, " %6d :", (int)m);


for (l=0; l<loops; l++)
{

for(i = 0; i < m * COMPSIZE * abs(inc_x); i++) {
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}

for(i = 0; i < m * COMPSIZE * abs(inc_y); i++) {
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}

gettimeofday( &start, (struct timezone *)0);
result = DSDOT(&m, x, &inc_x, y, &inc_y);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;

gettimeofday( &start, (struct timezone *)0);
result_c = dsdot_c(m, x, inc_x, y, inc_y);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg_c += time1;

test &= assert_dbl_near(result, result_c, SINGLE_EPS);
}

timeg /= loops;
timeg_c /= loops;

fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");

}

return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 618
- 0
ztest/gemv.c View File

@@ -0,0 +1,618 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

/* Absolute comparison tolerances.
   NOTE(review): presumably passed as the `tol` argument of
   assert_dbl_near(); confirm against the callers further down the file. */
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Report whether two doubles agree to within an absolute tolerance.
 *
 *   exp  - expected value
 *   real - value under test
 *   tol  - largest absolute difference that still counts as a match
 *
 * Returns 1 when |exp - real| <= tol and 0 otherwise.  A NaN on either side
 * never matches: the check accepts only values proven to be within tol,
 * because every ordered comparison involving NaN is false.  Equal values,
 * including equal infinities, differ by exactly zero.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double absdiff = 0.0;

    /* inf - inf would be NaN, so only subtract when the values differ. */
    if (exp != real) {
        /* avoid using fabs and linking with a math lib */
        absdiff = exp - real;
        if (absdiff < 0) {
            absdiff = -absdiff;
        }
    }

    return absdiff <= tol ? 1 : 0;
}

#ifdef COMPLEX
/*
 * Plain-C reference for the non-transposed complex GEMV update:
 * for every column j, y += (alpha (*) x[j]) (*) A[:, j], where (*) is a
 * complex product whose signs are selected at compile time by the CONJ and
 * XCONJ macros.
 *
 * m, n             - number of rows / columns of A that are visited
 * dummy1           - unused
 * alpha_r, alpha_i - real and imaginary part of the scalar
 * a, lda           - matrix as interleaved (re, im) pairs; consecutive
 *                    columns are lda complex elements apart
 * x, inc_x         - input vector and its stride in complex elements
 * y, inc_y         - accumulated output vector and its stride in complex
 *                    elements
 *
 * Always returns 0.
 *
 * The former unit-stride special case performed exactly the same arithmetic
 * as the general loop (strides of 2 floats), so only one loop nest is kept.
 * A leftover debug printf in that special case, which printed every matrix
 * element from inside the inner loop, has been removed.
 */
int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i;
    BLASLONG ix, iy;
    BLASLONG j;
    FLOAT *a_ptr;
    FLOAT temp_r, temp_i;
    BLASLONG inc_x2, inc_y2;
    BLASLONG lda2;
    BLASLONG i2;

    (void)dummy1;

    /* All offsets are counted in FLOATs, i.e. two per complex element. */
    lda2 = 2 * lda;
    inc_x2 = 2 * inc_x;
    inc_y2 = 2 * inc_y;

    ix = 0;
    a_ptr = a;

    for (j = 0; j < n; j++)
    {
        /* Scale the j-th element of x by alpha once per column. */
#if !defined(XCONJ)
        temp_r = alpha_r * x[ix] - alpha_i * x[ix+1];
        temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
#else
        temp_r = alpha_r * x[ix] + alpha_i * x[ix+1];
        temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
#endif
        iy = 0;
        i2 = 0;

        for (i = 0; i < m; i++)
        {
#if !defined(CONJ)

#if !defined(XCONJ)
            y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1];
            y[iy+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
#else
            y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1];
            y[iy+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
#endif

#else

#if !defined(XCONJ)
            y[iy] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1];
            y[iy+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
#else
            y[iy] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1];
            y[iy+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
#endif

#endif
            i2 += 2;
            iy += inc_y2;
        }
        a_ptr += lda2;
        ix += inc_x2;
    }

    return(0);
}
/*
 * Plain-C reference for the transposed complex GEMV update: for every
 * column of A, a complex dot product of that column with x is formed, scaled
 * by alpha and accumulated into the matching element of y.  The signs used
 * in the complex products are chosen at compile time by CONJ and XCONJ.
 *
 * m, n             - length of each dot product / number of columns
 * dummy1           - unused
 * alpha_r, alpha_i - real and imaginary part of the scalar
 * a, lda           - matrix as interleaved (re, im) pairs; columns are lda
 *                    complex elements apart
 * x, inc_x         - input vector and stride (in complex elements)
 * y, inc_y         - accumulated output vector and stride (in complex
 *                    elements)
 *
 * Always returns 0.
 */
int zgemv_t_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    /* Strides expressed in FLOATs (two per complex element).  With unit
       increments these are 2, which is what a dedicated contiguous path
       would use, so a single loop nest covers every case. */
    const BLASLONG x_step = 2 * inc_x;
    const BLASLONG y_step = 2 * inc_y;
    const BLASLONG col_step = 2 * lda;

    FLOAT *col = a;
    BLASLONG y_pos = 0;
    BLASLONG c, r;

    for (c = 0; c < n; c++)
    {
        FLOAT dot_re = 0.0;
        FLOAT dot_im = 0.0;
        BLASLONG x_pos = 0;
        BLASLONG a_pos = 0;

        for (r = 0; r < m; r++)
        {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            dot_re += col[a_pos] * x[x_pos] - col[a_pos+1] * x[x_pos+1];
            dot_im += col[a_pos] * x[x_pos+1] + col[a_pos+1] * x[x_pos];
#else
            dot_re += col[a_pos] * x[x_pos] + col[a_pos+1] * x[x_pos+1];
            dot_im += col[a_pos] * x[x_pos+1] - col[a_pos+1] * x[x_pos];
#endif
            a_pos += 2;
            x_pos += x_step;
        }

        /* Fold the scaled dot product into y. */
#if !defined(XCONJ)
        y[y_pos] += alpha_r * dot_re - alpha_i * dot_im;
        y[y_pos+1] += alpha_r * dot_im + alpha_i * dot_re;
#else
        y[y_pos] += alpha_r * dot_re + alpha_i * dot_im;
        y[y_pos+1] -= alpha_r * dot_im - alpha_i * dot_re;
#endif

        col += col_step;
        y_pos += y_step;
    }

    return 0;
}
#else
/*
 * Plain-C reference for the non-transposed real GEMV update:
 * y += alpha * x[j] * A[:, j] for each of the n columns.
 *
 * m, n     - rows / columns of A that are visited
 * dummy1   - unused
 * alpha    - scalar multiplier
 * a, lda   - matrix; column j starts at a + j * lda
 * x, inc_x - input vector and its stride
 * y, inc_y - accumulated output vector and its stride
 *
 * Always returns 0.
 */
int gemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    FLOAT *column = a;
    FLOAT scaled;
    BLASLONG x_pos = 0;
    BLASLONG y_pos;
    BLASLONG col, row;

    for (col = 0; col < n; col++, column += lda, x_pos += inc_x)
    {
        /* alpha * x[j] is constant for the whole column. */
        scaled = alpha * x[x_pos];

        for (row = 0, y_pos = 0; row < m; row++, y_pos += inc_y)
        {
            y[y_pos] += scaled * column[row];
        }
    }

    return 0;
}
/*
 * Plain-C reference for the transposed real GEMV update:
 * y[j] += alpha * dot(A[:, j], x) for each of the n columns.
 *
 * m, n     - length of each dot product / number of columns
 * dummy1   - unused
 * alpha    - scalar multiplier
 * a, lda   - matrix; column j starts at a + j * lda
 * x, inc_x - input vector and its stride
 * y, inc_y - accumulated output vector and its stride
 *
 * Always returns 0.
 */
int gemv_t_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    FLOAT *column = a;
    FLOAT dot;
    BLASLONG y_pos = 0;
    BLASLONG x_pos;
    BLASLONG col, row;

    for (col = 0; col < n; col++, y_pos += inc_y, column += lda)
    {
        dot = 0.0;

        for (row = 0, x_pos = 0; row < m; row++, x_pos += inc_x)
        {
            dot += column[row] * x[x_pos];
        }

        y[y_pos] += alpha * dot;
    }

    return 0;
}
#endif

/* Map GEMV onto the library routine under test: sgemv/dgemv for real builds,
   cgemv/zgemv when COMPLEX is defined; DOUBLE selects the double-precision
   variant in each case. */
#undef GEMV
#ifndef COMPLEX
#ifdef DOUBLE
#define GEMV BLASFUNC(dgemv)
#else
#define GEMV BLASFUNC(sgemv)
#endif
#else
#ifdef DOUBLE
#define GEMV BLASFUNC(zgemv)
#else
#define GEMV BLASFUNC(cgemv)
#endif
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

/*
 * gettimeofday() stand-in for Windows builds, based on
 * GetSystemTimeAsFileTime().
 *
 * tv - receives seconds and microseconds since the Unix epoch; may be NULL,
 *      in which case nothing is written
 * tz - ignored
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

    FILETIME ft;
    unsigned __int64 tmpres = 0;

    (void)tz; /* the time-zone argument is not used */

    if (NULL != tv)
    {
        GetSystemTimeAsFileTime(&ft);

        /* Assemble the two 32-bit halves into one 64-bit tick count. */
        tmpres |= ft.dwHighDateTime;
        tmpres <<= 32;
        tmpres |= ft.dwLowDateTime;

        /*converting file time to unix epoch*/
        tmpres /= 10; /*convert into microseconds*/
        tmpres -= DELTA_EPOCH_IN_MICROSECS;
        tv->tv_sec = (long)(tmpres / 1000000UL);
        tv->tv_usec = (long)(tmpres % 1000000UL);
    }

    return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/*
 * Allocate `size` bytes from a SysV shared-memory segment created with
 * SHM_HUGETLB.  The requested size has HUGE_PAGESIZE added and is then
 * masked down to a multiple of HUGE_PAGESIZE.  The segment is marked for
 * removal (IPC_RMID) right after it has been attached.
 *
 * Prints a message and exits with status 1 when shmget() or shmat() fails;
 * otherwise returns the attached address.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
    void *mapped;
    int segment;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapped = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mapped == -1) {
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mapped;
}

#define malloc huge_malloc

#endif

/* Fill v[0 .. count) with pseudo-random values in [-0.5, 0.5]. */
static void gemv_test_fill_random(FLOAT *v, size_t count)
{
    size_t k;

    for (k = 0; k < count; k++) {
        v[k] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    }
}

/*
 * Run and time one m x n problem `loops` times with both the library GEMV
 * and the C reference, compare the results and print one result line.
 *
 * *test is AND-ed with every comparison, so a failure stays visible on all
 * following lines (same behaviour as before).
 *
 * NOTE(review): the reference kernels index from element 0 with the raw
 * increment, so negative increments are not handled by them - confirm
 * whether negative OPENBLAS_INCX / OPENBLAS_INCY are meant to be supported.
 */
static void gemv_test_run_size(char trans, blasint m, blasint n,
                               blasint inc_x, blasint inc_y, int loops,
                               FLOAT *alpha, FLOAT *beta,
                               FLOAT *a, FLOAT *x, FLOAT *y, FLOAT *y_c,
                               int *test)
{
    struct timeval start, stop;
    double time1;
    double timeg = 0;
    double timeg_c = 0;
    /* Non-positive sizes are treated as empty when sizing the buffers. */
    const size_t rows = (m > 0) ? (size_t)m : 0;
    const size_t cols = (n > 0) ? (size_t)n : 0;
    /* x holds n (trans 'N') or m elements and y the other one.  Filling
       max(m, n) elements of both keeps every element either routine can
       touch initialised, whatever the transpose mode. */
    const size_t longest = (rows > cols) ? rows : cols;
    const size_t x_count = longest * COMPSIZE * (size_t)abs(inc_x);
    const size_t y_count = longest * COMPSIZE * (size_t)abs(inc_y);
    size_t k;
    int l;

    fprintf(stderr, " %6dx%d :", (int)m,(int)n);

    /* The matrix is passed with lda == m, so its m x n elements are
       contiguous. */
    gemv_test_fill_random(a, rows * cols * COMPSIZE);

    for (l = 0; l < loops; l++)
    {
        gemv_test_fill_random(x, x_count);
        gemv_test_fill_random(y, y_count);
        for (k = 0; k < y_count; k++) {
            y_c[k] = y[k];
        }

        gettimeofday( &start, (struct timezone *)0);
        GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
        gettimeofday( &stop, (struct timezone *)0);
        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
        timeg += time1;

        gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
        if (trans == 'N')
            zgemv_n_c(m, n, 0, alpha[0], alpha[1], a, m, x, inc_x, y_c, inc_y);
        else
            zgemv_t_c(m, n, 0, alpha[0], alpha[1], a, m, x, inc_x, y_c, inc_y);
#else
        if (trans == 'N')
            gemv_n_c(m, n, 0, *alpha, a, m, x, inc_x, y_c, inc_y);
        else
            gemv_t_c(m, n, 0, *alpha, a, m, x, inc_x, y_c, inc_y);
#endif
        gettimeofday( &stop, (struct timezone *)0);
        time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
        timeg_c += time1;

        /* Compare the whole buffer: entries neither routine writes are
           identical copies, so this covers every real and imaginary part
           for any stride without stepping outside the allocation. */
        for (k = 0; k < y_count; k++) {
            *test &= assert_dbl_near(y[k], y_c[k], SINGLE_EPS);
        }
    }

    timeg /= loops;
    timeg_c /= loops;

    /* One multiply and one add per matrix element; a complex
       multiply-add counts as COMPSIZE * COMPSIZE real ones. */
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg, timeg_c, *test ? "PASS" : "FAILD");
}

/*
 * GEMV test driver.
 *
 * Usage: prog [from [to [step]]]
 *
 * Environment:
 *   OPENBLAS_LOOPS   repetitions per size (default 1)
 *   OPENBLAS_INCX    stride of x (default 1)
 *   OPENBLAS_INCY    stride of y (default 1)
 *   OPENBLAS_TRANS   transpose flag handed to GEMV (default 'N')
 *   OPENBLAS_PARAM_N fix n and sweep m over [from, to]
 *   OPENBLAS_PARAM_M fix m and sweep n over [from, to]; only read when
 *                    OPENBLAS_PARAM_N is absent or not positive
 *
 * With neither PARAM variable set, square m x m problems are swept.
 * Returns 0; exits with status 1 when a buffer cannot be allocated.
 */
int main(int argc, char *argv[]){

    FLOAT *a, *x, *y, *y_c;
    FLOAT alpha[] = {1.0, 1.0};
    FLOAT beta [] = {1.0, 1.0};
    char trans='N';
    blasint m = 0;
    blasint n = 0;
    blasint inc_x=1,inc_y=1;
    int has_param_n = 0;
    int has_param_m = 0;
    int loops = 1;
    char *p;

    int from = 1;
    int to = 200;
    int step = 1;
    int tomax;

    int test = 1;

    argc--;argv++;

    if (argc > 0) { from = atol(*argv); argc--; argv++;}
    if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
    if (argc > 0) { step = atol(*argv); argc--; argv++;}

    /* Largest dimension that can occur; all buffers are sized from it. */
    tomax = to;

    if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
    if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
    if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
    if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
    if ((p = getenv("OPENBLAS_PARAM_N"))) {
        n = atoi(p);
        if ((n>0)) has_param_n = 1;
        if ( n > tomax ) tomax = n;
    }
    if ( has_param_n == 0 )
        if ((p = getenv("OPENBLAS_PARAM_M"))) {
            m = atoi(p);
            if ((m>0)) has_param_m = 1;
            if ( m > tomax ) tomax = m;
        }

    fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops);

    if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }

    if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }

    if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }

    if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }

#ifdef linux
    srandom(getpid());
#endif

    fprintf(stderr, " SIZE Flops Time CTime Test\n");

    if (has_param_m == 0)
    {
        /* Sweep m; n is either fixed by OPENBLAS_PARAM_N or follows m. */
        for(m = from; m <= to; m += step)
        {
            if ( has_param_n == 0 ) n = m;
            gemv_test_run_size(trans, m, n, inc_x, inc_y, loops, alpha, beta, a, x, y, y_c, &test);
        }
    }
    else
    {
        /* m is fixed by OPENBLAS_PARAM_M; sweep n. */
        for(n = from; n <= to; n += step)
        {
            gemv_test_run_size(trans, m, n, inc_x, inc_y, loops, alpha, beta, a, x, y, y_c, &test);
        }
    }

    return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 284
- 0
ztest/iamax.c View File

@@ -0,0 +1,284 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Report whether two doubles agree to within an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually produced
 * tol  - maximum accepted absolute difference (assumed non-negative)
 *
 * Returns 1 when the values are equal or |exp - real| <= tol, 0 otherwise.
 * A NaN on either side yields 0: the previous "absdiff > tol" test was false
 * for NaN and therefore reported a NaN result as a match.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double absdiff;

    /* Exact equality first, so that equal infinities still compare as near
       (inf - inf would otherwise produce NaN). */
    if (exp == real) {
        return 1;
    }

    absdiff = exp - real;
    /* avoid using fabs and linking with a math lib */
    if (absdiff < 0) {
        absdiff = -absdiff;
    }

    /* "<=" is false for NaN, so an invalid result is rejected. */
    return absdiff <= tol;
}

/* ABS is the absolute-value function matching the precision being built:
   fabs when DOUBLE is defined, fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#ifdef COMPLEX
/* |re| + |im| of the complex element whose real part is x[i].  The
   arguments and the whole expansion are parenthesised so that the macro
   stays correct inside larger expressions. */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i)+1]))
/*
 * Plain-C reference for the complex "index of maximum" routine: returns the
 * 1-based position of the first element with the largest |re| + |im|.
 *
 * n     - number of complex elements
 * x     - interleaved (re, im) pairs
 * inc_x - stride between elements, in complex elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG izamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i=0;
    BLASLONG ix=0;
    FLOAT maxf;
    BLASLONG max=0;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return(max);

    /* Offsets are counted in FLOATs: two per complex element. */
    inc_x2 = 2 * inc_x;

    maxf = CABS1(x,0);
    ix += inc_x2;
    i++;

    while(i < n)
    {
        /* Strict comparison keeps the first of several equal maxima. */
        if( CABS1(x,ix) > maxf )
        {
            max = i;
            maxf = CABS1(x,ix);
        }
        ix += inc_x2;
        i++;
    }
    return(max+1);
}
#else
/*
 * Plain-C reference for the real "index of maximum absolute value" routine.
 *
 * n     - number of elements
 * x     - input vector
 * inc_x - stride between consecutive elements
 *
 * Returns the 1-based position of the first element with the largest
 * absolute value, or 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG iamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best = 0;
    BLASLONG k, pos;
    FLOAT best_abs;

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    best_abs = ABS(x[0]);

    for (k = 1, pos = inc_x; k < n; k++, pos += inc_x) {
        /* Strict comparison keeps the first of several equal maxima. */
        if (ABS(x[pos]) > best_abs) {
            best = k;
            best_abs = ABS(x[pos]);
        }
    }

    return best + 1;
}
#endif

/* Map IAMAX onto the library routine under test: icamax/izamax when COMPLEX
   is defined, isamax/idamax otherwise; DOUBLE selects the double-precision
   variant in each case. */
#undef IAMAX
#ifdef COMPLEX
#ifdef DOUBLE
#define IAMAX BLASFUNC(izamax)
#else
#define IAMAX BLASFUNC(icamax)
#endif
#else
#ifdef DOUBLE
#define IAMAX BLASFUNC(idamax)
#else
#define IAMAX BLASFUNC(isamax)
#endif
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

/*
 * gettimeofday() stand-in for Windows builds, based on
 * GetSystemTimeAsFileTime().
 *
 * tv - receives seconds and microseconds since the Unix epoch; may be NULL,
 *      in which case nothing is written
 * tz - ignored
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

    FILETIME ft;
    unsigned __int64 tmpres = 0;

    (void)tz; /* the time-zone argument is not used */

    if (NULL != tv)
    {
        GetSystemTimeAsFileTime(&ft);

        /* Assemble the two 32-bit halves into one 64-bit tick count. */
        tmpres |= ft.dwHighDateTime;
        tmpres <<= 32;
        tmpres |= ft.dwLowDateTime;

        /*converting file time to unix epoch*/
        tmpres /= 10; /*convert into microseconds*/
        tmpres -= DELTA_EPOCH_IN_MICROSECS;
        tv->tv_sec = (long)(tmpres / 1000000UL);
        tv->tv_usec = (long)(tmpres % 1000000UL);
    }

    return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/*
 * Allocate `size` bytes from a SysV shared-memory segment created with
 * SHM_HUGETLB.  The requested size has HUGE_PAGESIZE added and is then
 * masked down to a multiple of HUGE_PAGESIZE.  The segment is marked for
 * removal (IPC_RMID) right after it has been attached.
 *
 * Prints a message and exits with status 1 when shmget() or shmat() fails;
 * otherwise returns the attached address.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
    void *mapped;
    int segment;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapped = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mapped == -1) {
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mapped;
}

#define malloc huge_malloc

#endif

/*
 * IAMAX test driver.
 *
 * Usage: prog [from [to [step]]]
 * Environment: OPENBLAS_LOOPS (repetitions per size, default 1) and
 *              OPENBLAS_INCX  (vector stride, default 1).
 *
 * For every length in [from, to] the library routine and the C reference
 * are run on the same random vector; one line with timings and the
 * cumulative PASS/FAILD state is printed per length.  Returns 0; exits with
 * status 1 when the vector cannot be allocated.
 */
int main(int argc, char *argv[]){

    int first = 1;
    int last = 200;
    int stride = 1;
    int repeat = 1;
    int ok = 1;
    int pass;
    blasint inc_x = 1;
    blasint len, k;
    FLOAT *vec;
    char *env;
    struct timeval t0, t1;
    double blas_time, ref_time;

    /* Optional positional arguments: from, to, step. */
    if (argc > 1) first = atol(argv[1]);
    if (argc > 2) last = MAX(atol(argv[2]), first);
    if (argc > 3) stride = atol(argv[3]);

    env = getenv("OPENBLAS_LOOPS");
    if (env) repeat = atoi(env);
    env = getenv("OPENBLAS_INCX");
    if (env) inc_x = atoi(env);

    fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", first, last, stride,inc_x,repeat);

    vec = (FLOAT *)malloc(sizeof(FLOAT) * last * abs(inc_x) * COMPSIZE);
    if (vec == NULL) {
        fprintf(stderr,"Out of Memory!!\n");
        exit(1);
    }

#ifdef linux
    srandom(getpid());
#endif

    fprintf(stderr, " SIZE Flops Time CTime Test\n");

    for (len = first; len <= last; len += stride)
    {
        blas_time = 0;
        ref_time = 0;

        fprintf(stderr, " %6d :", (int)len);

        for (pass = 0; pass < repeat; pass++)
        {
            BLASLONG got, want;

            for (k = 0; k < len * COMPSIZE * abs(inc_x); k++) {
                vec[k] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
            }

            /* Library routine. */
            gettimeofday(&t0, (struct timezone *)0);
            got = IAMAX (&len, vec, &inc_x);
            gettimeofday(&t1, (struct timezone *)0);
            blas_time += (double)(t1.tv_sec - t0.tv_sec) + (double)((t1.tv_usec - t0.tv_usec)) * 1.e-6;

            /* C reference. */
            gettimeofday(&t0, (struct timezone *)0);
#ifdef COMPLEX
            want = izamax_c(len, vec, inc_x);
#else
            want = iamax_c(len, vec, inc_x);
#endif
            gettimeofday(&t1, (struct timezone *)0);
            ref_time += (double)(t1.tv_sec - t0.tv_sec) + (double)((t1.tv_usec - t0.tv_usec)) * 1.e-6;

            ok &= (got == want);
        }

        blas_time /= repeat;
        ref_time /= repeat;
#ifdef COMPLEX
        fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 6. * (double)len / blas_time * 1.e-6, blas_time, ref_time, ok ? "PASS" : "FAILD");
#else
        fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)len / blas_time * 1.e-6, blas_time, ref_time, ok ? "PASS" : "FAILD");
#endif
    }

    return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 284
- 0
ztest/iamin.c View File

@@ -0,0 +1,284 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Report whether two doubles agree to within an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually produced
 * tol  - maximum accepted absolute difference (assumed non-negative)
 *
 * Returns 1 when the values are equal or |exp - real| <= tol, 0 otherwise.
 * A NaN on either side yields 0: the previous "absdiff > tol" test was false
 * for NaN and therefore reported a NaN result as a match.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double absdiff;

    /* Exact equality first, so that equal infinities still compare as near
       (inf - inf would otherwise produce NaN). */
    if (exp == real) {
        return 1;
    }

    absdiff = exp - real;
    /* avoid using fabs and linking with a math lib */
    if (absdiff < 0) {
        absdiff = -absdiff;
    }

    /* "<=" is false for NaN, so an invalid result is rejected. */
    return absdiff <= tol;
}

/* ABS is the absolute-value function matching the precision being built:
   fabs when DOUBLE is defined, fabsf otherwise. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#ifdef COMPLEX
/* |re| + |im| of the complex element whose real part is x[i].  The
   arguments and the whole expansion are parenthesised so that the macro
   stays correct inside larger expressions. */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i)+1]))
/*
 * Plain-C reference for the complex "index of minimum" routine: returns the
 * 1-based position of the first element with the smallest |re| + |im|.
 *
 * n     - number of complex elements
 * x     - interleaved (re, im) pairs
 * inc_x - stride between elements, in complex elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG izamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i=0;
    BLASLONG ix=0;
    FLOAT minf;
    BLASLONG min=0;
    BLASLONG inc_x2;

    if (n <= 0 || inc_x <= 0) return(min);

    /* Offsets are counted in FLOATs: two per complex element. */
    inc_x2 = 2 * inc_x;

    minf = CABS1(x,0);
    ix += inc_x2;
    i++;

    while(i < n)
    {
        /* Strict comparison keeps the first of several equal minima. */
        if( CABS1(x,ix) < minf )
        {
            min = i;
            minf = CABS1(x,ix);
        }
        ix += inc_x2;
        i++;
    }
    return(min+1);
}
#else
/*
 * Plain-C reference for the real "index of minimum absolute value" routine.
 *
 * n     - number of elements
 * x     - input vector
 * inc_x - stride between consecutive elements
 *
 * Returns the 1-based position of the first element with the smallest
 * absolute value, or 0 when n <= 0 or inc_x <= 0.
 */
BLASLONG iamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best = 0;
    BLASLONG k, pos;
    FLOAT best_abs;

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    best_abs = ABS(x[0]);

    for (k = 1, pos = inc_x; k < n; k++, pos += inc_x) {
        /* Strict comparison keeps the first of several equal minima. */
        if (ABS(x[pos]) < best_abs) {
            best = k;
            best_abs = ABS(x[pos]);
        }
    }

    return best + 1;
}
#endif

/* Map IAMIN onto the library routine under test: icamin/izamin when COMPLEX
   is defined, isamin/idamin otherwise; DOUBLE selects the double-precision
   variant in each case. */
#undef IAMIN
#ifdef COMPLEX
#ifdef DOUBLE
#define IAMIN BLASFUNC(izamin)
#else
#define IAMIN BLASFUNC(icamin)
#endif
#else
#ifdef DOUBLE
#define IAMIN BLASFUNC(idamin)
#else
#define IAMIN BLASFUNC(isamin)
#endif
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

/*
 * gettimeofday() stand-in for Windows builds, based on
 * GetSystemTimeAsFileTime().
 *
 * tv - receives seconds and microseconds since the Unix epoch; may be NULL,
 *      in which case nothing is written
 * tz - ignored
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

    FILETIME ft;
    unsigned __int64 tmpres = 0;

    (void)tz; /* the time-zone argument is not used */

    if (NULL != tv)
    {
        GetSystemTimeAsFileTime(&ft);

        /* Assemble the two 32-bit halves into one 64-bit tick count. */
        tmpres |= ft.dwHighDateTime;
        tmpres <<= 32;
        tmpres |= ft.dwLowDateTime;

        /*converting file time to unix epoch*/
        tmpres /= 10; /*convert into microseconds*/
        tmpres -= DELTA_EPOCH_IN_MICROSECS;
        tv->tv_sec = (long)(tmpres / 1000000UL);
        tv->tv_usec = (long)(tmpres % 1000000UL);
    }

    return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/*
 * Allocate `size` bytes from a SysV shared-memory segment created with
 * SHM_HUGETLB.  The requested size has HUGE_PAGESIZE added and is then
 * masked down to a multiple of HUGE_PAGESIZE.  The segment is marked for
 * removal (IPC_RMID) right after it has been attached.
 *
 * Prints a message and exits with status 1 when shmget() or shmat() fails;
 * otherwise returns the attached address.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
    void *mapped;
    int segment;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapped = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mapped == -1) {
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mapped;
}

#define malloc huge_malloc

#endif

/*
 * IAMIN test driver.
 *
 * Usage: prog [from [to [step]]]
 * Environment: OPENBLAS_LOOPS (repetitions per size, default 1) and
 *              OPENBLAS_INCX  (vector stride, default 1).
 *
 * For every length in [from, to] the library routine and the C reference
 * are run on the same random vector; one line with timings and the
 * cumulative PASS/FAILD state is printed per length.  Returns 0; exits with
 * status 1 when the vector cannot be allocated.
 */
int main(int argc, char *argv[]){

    int first = 1;
    int last = 200;
    int stride = 1;
    int repeat = 1;
    int ok = 1;
    int pass;
    blasint inc_x = 1;
    blasint len, k;
    FLOAT *vec;
    char *env;
    struct timeval t0, t1;
    double blas_time, ref_time;

    /* Optional positional arguments: from, to, step. */
    if (argc > 1) first = atol(argv[1]);
    if (argc > 2) last = MAX(atol(argv[2]), first);
    if (argc > 3) stride = atol(argv[3]);

    env = getenv("OPENBLAS_LOOPS");
    if (env) repeat = atoi(env);
    env = getenv("OPENBLAS_INCX");
    if (env) inc_x = atoi(env);

    fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", first, last, stride,inc_x,repeat);

    vec = (FLOAT *)malloc(sizeof(FLOAT) * last * abs(inc_x) * COMPSIZE);
    if (vec == NULL) {
        fprintf(stderr,"Out of Memory!!\n");
        exit(1);
    }

#ifdef linux
    srandom(getpid());
#endif

    fprintf(stderr, " SIZE Flops Time CTime Test\n");

    for (len = first; len <= last; len += stride)
    {
        blas_time = 0;
        ref_time = 0;

        fprintf(stderr, " %6d :", (int)len);

        for (pass = 0; pass < repeat; pass++)
        {
            BLASLONG got, want;

            for (k = 0; k < len * COMPSIZE * abs(inc_x); k++) {
                vec[k] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
            }

            /* Library routine. */
            gettimeofday(&t0, (struct timezone *)0);
            got = IAMIN (&len, vec, &inc_x);
            gettimeofday(&t1, (struct timezone *)0);
            blas_time += (double)(t1.tv_sec - t0.tv_sec) + (double)((t1.tv_usec - t0.tv_usec)) * 1.e-6;

            /* C reference. */
            gettimeofday(&t0, (struct timezone *)0);
#ifdef COMPLEX
            want = izamin_c(len, vec, inc_x);
#else
            want = iamin_c(len, vec, inc_x);
#endif
            gettimeofday(&t1, (struct timezone *)0);
            ref_time += (double)(t1.tv_sec - t0.tv_sec) + (double)((t1.tv_usec - t0.tv_usec)) * 1.e-6;

            ok &= (got == want);
        }

        blas_time /= repeat;
        ref_time /= repeat;
#ifdef COMPLEX
        fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 6. * (double)len / blas_time * 1.e-6, blas_time, ref_time, ok ? "PASS" : "FAILD");
#else
        fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)len / blas_time * 1.e-6, blas_time, ref_time, ok ? "PASS" : "FAILD");
#endif
    }

    return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 231
- 0
ztest/imax.c View File

@@ -0,0 +1,231 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Report whether two doubles agree to within an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually produced
 * tol  - maximum accepted absolute difference (assumed non-negative)
 *
 * Returns 1 when the values are equal or |exp - real| <= tol, 0 otherwise.
 * A NaN on either side yields 0: the previous "absdiff > tol" test was false
 * for NaN and therefore reported a NaN result as a match.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double absdiff;

    /* Exact equality first, so that equal infinities still compare as near
       (inf - inf would otherwise produce NaN). */
    if (exp == real) {
        return 1;
    }

    absdiff = exp - real;
    /* avoid using fabs and linking with a math lib */
    if (absdiff < 0) {
        absdiff = -absdiff;
    }

    /* "<=" is false for NaN, so an invalid result is rejected. */
    return absdiff <= tol;
}

/*
 * Plain-C reference for the "index of maximum value" routine (signed
 * comparison, no absolute value).
 *
 * n     - number of elements
 * x     - input vector
 * inc_x - stride between consecutive elements
 *
 * Returns the 1-based position of the first largest element, or 0 when
 * n <= 0 or inc_x <= 0.
 */
BLASLONG imax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best = 0;
    BLASLONG k, pos;
    FLOAT best_val;

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    best_val = x[0];

    for (k = 1, pos = inc_x; k < n; k++, pos += inc_x) {
        /* Strict comparison keeps the first of several equal maxima. */
        if (x[pos] > best_val) {
            best = k;
            best_val = x[pos];
        }
    }

    return best + 1;
}

/* Map IMAX onto the library routine under test: idmax when DOUBLE is
   defined, ismax otherwise. */
#undef IMAX
#ifdef DOUBLE
#define IMAX BLASFUNC(idmax)
#else
#define IMAX BLASFUNC(ismax)
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

/*
 * gettimeofday() stand-in for Windows builds, based on
 * GetSystemTimeAsFileTime().
 *
 * tv - receives seconds and microseconds since the Unix epoch; may be NULL,
 *      in which case nothing is written
 * tz - ignored
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

    FILETIME ft;
    unsigned __int64 tmpres = 0;

    (void)tz; /* the time-zone argument is not used */

    if (NULL != tv)
    {
        GetSystemTimeAsFileTime(&ft);

        /* Assemble the two 32-bit halves into one 64-bit tick count. */
        tmpres |= ft.dwHighDateTime;
        tmpres <<= 32;
        tmpres |= ft.dwLowDateTime;

        /*converting file time to unix epoch*/
        tmpres /= 10; /*convert into microseconds*/
        tmpres -= DELTA_EPOCH_IN_MICROSECS;
        tv->tv_sec = (long)(tmpres / 1000000UL);
        tv->tv_usec = (long)(tmpres % 1000000UL);
    }

    return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/*
 * Allocate `size` bytes from a SysV shared-memory segment created with
 * SHM_HUGETLB.  The requested size has HUGE_PAGESIZE added and is then
 * masked down to a multiple of HUGE_PAGESIZE.  The segment is marked for
 * removal (IPC_RMID) right after it has been attached.
 *
 * Prints a message and exits with status 1 when shmget() or shmat() fails;
 * otherwise returns the attached address.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
    void *mapped;
    int segment;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mapped = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mapped == -1) {
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mapped;
}

#define malloc huge_malloc

#endif

/*
 * IMAX test driver.
 *
 * Usage: prog [from [to [step]]]
 * Environment: OPENBLAS_LOOPS (repetitions per size, default 1) and
 *              OPENBLAS_INCX  (vector stride, default 1).
 *
 * For every length in [from, to] the library routine and the C reference
 * are run on the same random vector; one line with timings and the
 * cumulative PASS/FAILD state is printed per length.  Returns 0; exits with
 * status 1 when the vector cannot be allocated.
 */
int main(int argc, char *argv[]){

    int first = 1;
    int last = 200;
    int stride = 1;
    int repeat = 1;
    int ok = 1;
    int pass;
    blasint inc_x = 1;
    blasint len, k;
    FLOAT *vec;
    char *env;
    struct timeval t0, t1;
    double blas_time, ref_time;

    /* Optional positional arguments: from, to, step. */
    if (argc > 1) first = atol(argv[1]);
    if (argc > 2) last = MAX(atol(argv[2]), first);
    if (argc > 3) stride = atol(argv[3]);

    env = getenv("OPENBLAS_LOOPS");
    if (env) repeat = atoi(env);
    env = getenv("OPENBLAS_INCX");
    if (env) inc_x = atoi(env);

    fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", first, last, stride,inc_x,repeat);

    vec = (FLOAT *)malloc(sizeof(FLOAT) * last * abs(inc_x) * COMPSIZE);
    if (vec == NULL) {
        fprintf(stderr,"Out of Memory!!\n");
        exit(1);
    }

#ifdef linux
    srandom(getpid());
#endif

    fprintf(stderr, " SIZE Flops Time CTime Test\n");

    for (len = first; len <= last; len += stride)
    {
        blas_time = 0;
        ref_time = 0;

        fprintf(stderr, " %6d :", (int)len);

        for (pass = 0; pass < repeat; pass++)
        {
            BLASLONG got, want;

            for (k = 0; k < len * COMPSIZE * abs(inc_x); k++) {
                vec[k] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
            }

            /* Library routine. */
            gettimeofday(&t0, (struct timezone *)0);
            got = IMAX (&len, vec, &inc_x);
            gettimeofday(&t1, (struct timezone *)0);
            blas_time += (double)(t1.tv_sec - t0.tv_sec) + (double)((t1.tv_usec - t0.tv_usec)) * 1.e-6;

            /* C reference. */
            gettimeofday(&t0, (struct timezone *)0);
            want = imax_c(len, vec, inc_x);
            gettimeofday(&t1, (struct timezone *)0);
            ref_time += (double)(t1.tv_sec - t0.tv_sec) + (double)((t1.tv_usec - t0.tv_usec)) * 1.e-6;

            ok &= (got == want);
        }

        blas_time /= repeat;
        ref_time /= repeat;
        fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)len / blas_time * 1.e-6, blas_time, ref_time, ok ? "PASS" : "FAILD");
    }

    return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 231
- 0
ztest/imin.c View File

@@ -0,0 +1,231 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Report whether two doubles agree to within an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually produced
 * tol  - maximum accepted absolute difference (assumed non-negative)
 *
 * Returns 1 when the values are equal or |exp - real| <= tol, 0 otherwise.
 * A NaN on either side yields 0: the previous "absdiff > tol" test was false
 * for NaN and therefore reported a NaN result as a match.
 */
int assert_dbl_near(double exp, double real, double tol) {
    double absdiff;

    /* Exact equality first, so that equal infinities still compare as near
       (inf - inf would otherwise produce NaN). */
    if (exp == real) {
        return 1;
    }

    absdiff = exp - real;
    /* avoid using fabs and linking with a math lib */
    if (absdiff < 0) {
        absdiff = -absdiff;
    }

    /* "<=" is false for NaN, so an invalid result is rejected. */
    return absdiff <= tol;
}

/*
 * Plain-C reference for the "index of minimum value" routine (signed
 * comparison, no absolute value).
 *
 * n     - number of elements
 * x     - input vector
 * inc_x - stride between consecutive elements
 *
 * Returns the 1-based position of the first smallest element, or 0 when
 * n <= 0 or inc_x <= 0.
 */
BLASLONG imin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best = 0;
    BLASLONG k, pos;
    FLOAT best_val;

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    best_val = x[0];

    for (k = 1, pos = inc_x; k < n; k++, pos += inc_x) {
        /* Strict comparison keeps the first of several equal minima. */
        if (x[pos] < best_val) {
            best = k;
            best_val = x[pos];
        }
    }

    return best + 1;
}

#undef IMIN
#ifdef DOUBLE
#define IMIN BLASFUNC(idmin)
#else
#define IMIN BLASFUNC(ismin)
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

int gettimeofday(struct timeval *tv, void *tz){

FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;

if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);

tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;

/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}

return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/*
 * Allocate memory from a private SysV shared-memory segment requested
 * with SHM_HUGETLB. The requested size is padded by HUGE_PAGESIZE and
 * masked down to a multiple of it.
 *
 * Prints a message and exits with status 1 when shmget() or shmat()
 * fails. The segment is marked for removal right after attaching.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
    void *mem;
    int segment;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mem = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mem == -1){
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mem;
}

#define malloc huge_malloc

#endif

/*
 * Timing and verification driver for IMIN.
 *
 * Command line: [from [to [step]]] select the vector sizes to run.
 * Environment:  OPENBLAS_LOOPS (repetitions per size) and
 *               OPENBLAS_INCX (stride passed to both routines).
 *
 * For every size the library routine and imin_c() are timed on the same
 * random data and their results compared; one line per size goes to
 * stderr. Always returns 0.
 */
int main(int argc, char *argv[]){

    FLOAT *x;
    BLASLONG result, result_c;
    blasint m, i;
    blasint inc_x=1;
    int loops = 1;
    int l;
    char *p;

    int from = 1;
    int to = 200;
    int step = 1;

    struct timeval start, stop;
    double time1,timeg,timeg_c;

    /* never reset: once one size fails, every later line reports failure */
    int test = 1;

    argc--;argv++;

    if (argc > 0) { from = atol(*argv); argc--; argv++;}
    if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
    if (argc > 0) { step = atol(*argv); argc--; argv++;}

    if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
    if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

    /* a non-positive step would never finish the size loop, and a
       non-positive repetition count would average over zero runs */
    if (step <= 0) step = 1;
    if (loops <= 0) loops = 1;

    /* inc_x is a blasint; cast it for %d exactly as m is cast for %6d below */
    fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

    if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }

#ifdef linux
    srandom(getpid());
#endif

    fprintf(stderr, " SIZE Flops Time CTime Test\n");

    for(m = from; m <= to; m += step)
    {

        timeg=0;
        timeg_c=0;

        fprintf(stderr, " %6d :", (int)m);

        for (l=0; l<loops; l++)
        {

            /* fresh random data in [-0.5, 0.5] for every repetition */
            for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
                x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
            }

            gettimeofday( &start, (struct timezone *)0);
            result = IMIN (&m, x, &inc_x);
            gettimeofday( &stop, (struct timezone *)0);
            time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
            timeg += time1;

            gettimeofday( &start, (struct timezone *)0);
            result_c = imin_c(m, x, inc_x);
            gettimeofday( &stop, (struct timezone *)0);
            time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
            timeg_c += time1;

            test &= (result == result_c);

        }

        timeg /= loops;
        timeg_c /= loops;
        fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");

    }

    return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 229
- 0
ztest/max.c View File

@@ -0,0 +1,229 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Compare two doubles using an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually obtained
 * tol  - largest accepted absolute difference
 *
 * Returns 1 when |exp - real| does not exceed tol and 0 otherwise.
 * A NaN in either value is always reported as a mismatch (0).
 */
int assert_dbl_near(double exp, double real, double tol) {
    double diff;
    double absdiff;

    /* x != x holds only for NaN; without this guard every comparison
       below is false for NaN and a NaN result would be accepted */
    if (exp != exp || real != real) {
        return 0;
    }

    diff = exp - real;
    absdiff = diff;
    /* avoid using fabs and linking with a math lib */
    if (diff < 0) {
        absdiff *= -1;
    }
    if (absdiff > tol) {
        return 0;
    }
    return 1;
}

/*
 * Plain C reference for the maximum-value routine.
 *
 * Returns the largest of the n elements x[0], x[inc_x], x[2*inc_x], ...
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT max_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG k;
    BLASLONG pos;
    FLOAT largest = 0.0;

    if (!(n > 0 && inc_x > 0)) {
        return largest;
    }

    largest = x[0];
    for (k = 1, pos = inc_x; k < n; k++, pos += inc_x) {
        if (x[pos] > largest) {
            largest = x[pos];
        }
    }

    return largest;
}

#undef MAX_
#ifdef DOUBLE
#define MAX_ BLASFUNC(dmax)
#else
#define MAX_ BLASFUNC(smax)
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

int gettimeofday(struct timeval *tv, void *tz){

FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;

if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);

tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;

/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}

return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/*
 * Allocate memory from a private SysV shared-memory segment requested
 * with SHM_HUGETLB. The requested size is padded by HUGE_PAGESIZE and
 * masked down to a multiple of it.
 *
 * Prints a message and exits with status 1 when shmget() or shmat()
 * fails. The segment is marked for removal right after attaching.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
    void *mem;
    int segment;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mem = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mem == -1){
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mem;
}

#define malloc huge_malloc

#endif

/*
 * Timing and verification driver for MAX_.
 *
 * Command line: [from [to [step]]] select the vector sizes to run.
 * Environment:  OPENBLAS_LOOPS (repetitions per size) and
 *               OPENBLAS_INCX (stride passed to both routines).
 *
 * For every size the library routine and max_c() are timed on the same
 * random data and their results compared; one line per size goes to
 * stderr. Always returns 0.
 */
int main(int argc, char *argv[]){

    FLOAT *x;
    FLOAT result, result_c;
    blasint m, i;
    blasint inc_x=1;
    int loops = 1;
    int l;
    char *p;

    int from = 1;
    int to = 200;
    int step = 1;

    struct timeval start, stop;
    double time1,timeg,timeg_c;

    /* never reset: once one size fails, every later line reports failure */
    int test = 1;

    /* use the tolerance that matches the precision being built */
#ifdef DOUBLE
    const double eps = DOUBLE_EPS;
#else
    const double eps = SINGLE_EPS;
#endif

    argc--;argv++;

    if (argc > 0) { from = atol(*argv); argc--; argv++;}
    if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
    if (argc > 0) { step = atol(*argv); argc--; argv++;}

    if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
    if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

    /* a non-positive step would never finish the size loop, and a
       non-positive repetition count would average over zero runs */
    if (step <= 0) step = 1;
    if (loops <= 0) loops = 1;

    /* inc_x is a blasint; cast it for %d exactly as m is cast for %6d below */
    fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

    if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }

#ifdef linux
    srandom(getpid());
#endif

    fprintf(stderr, " SIZE Flops Time CTime Test\n");

    for(m = from; m <= to; m += step)
    {

        timeg=0;
        timeg_c=0;

        fprintf(stderr, " %6d :", (int)m);

        for (l=0; l<loops; l++)
        {

            /* fresh random data in [-0.5, 0.5] for every repetition */
            for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
                x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
            }

            gettimeofday( &start, (struct timezone *)0);
            result = MAX_ (&m, x, &inc_x);
            gettimeofday( &stop, (struct timezone *)0);
            time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
            timeg += time1;

            gettimeofday( &start, (struct timezone *)0);
            result_c = max_c(m, x, inc_x);
            gettimeofday( &stop, (struct timezone *)0);
            time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
            timeg_c += time1;

            test &= assert_dbl_near(result, result_c, eps);

        }

        timeg /= loops;
        timeg_c /= loops;
        fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");

    }

    return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 229
- 0
ztest/min.c View File

@@ -0,0 +1,229 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Compare two doubles using an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually obtained
 * tol  - largest accepted absolute difference
 *
 * Returns 1 when |exp - real| does not exceed tol and 0 otherwise.
 * A NaN in either value is always reported as a mismatch (0).
 */
int assert_dbl_near(double exp, double real, double tol) {
    double diff;
    double absdiff;

    /* x != x holds only for NaN; without this guard every comparison
       below is false for NaN and a NaN result would be accepted */
    if (exp != exp || real != real) {
        return 0;
    }

    diff = exp - real;
    absdiff = diff;
    /* avoid using fabs and linking with a math lib */
    if (diff < 0) {
        absdiff *= -1;
    }
    if (absdiff > tol) {
        return 0;
    }
    return 1;
}

/*
 * Plain C reference for the minimum-value routine.
 *
 * Returns the smallest of the n elements x[0], x[inc_x], x[2*inc_x], ...
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT min_c(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG k;
    BLASLONG pos;
    FLOAT smallest = 0.0;

    if (!(n > 0 && inc_x > 0)) {
        return smallest;
    }

    smallest = x[0];
    for (k = 1, pos = inc_x; k < n; k++, pos += inc_x) {
        if (x[pos] < smallest) {
            smallest = x[pos];
        }
    }

    return smallest;
}

#undef MIN_
#ifdef DOUBLE
#define MIN_ BLASFUNC(dmin)
#else
#define MIN_ BLASFUNC(smin)
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

int gettimeofday(struct timeval *tv, void *tz){

FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;

if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);

tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;

/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}

return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/*
 * Allocate memory from a private SysV shared-memory segment requested
 * with SHM_HUGETLB. The requested size is padded by HUGE_PAGESIZE and
 * masked down to a multiple of it.
 *
 * Prints a message and exits with status 1 when shmget() or shmat()
 * fails. The segment is marked for removal right after attaching.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
    void *mem;
    int segment;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mem = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mem == -1){
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mem;
}

#define malloc huge_malloc

#endif

/*
 * Timing and verification driver for MIN_.
 *
 * Command line: [from [to [step]]] select the vector sizes to run.
 * Environment:  OPENBLAS_LOOPS (repetitions per size) and
 *               OPENBLAS_INCX (stride passed to both routines).
 *
 * For every size the library routine and min_c() are timed on the same
 * random data and their results compared; one line per size goes to
 * stderr. Always returns 0.
 */
int main(int argc, char *argv[]){

    FLOAT *x;
    FLOAT result, result_c;
    blasint m, i;
    blasint inc_x=1;
    int loops = 1;
    int l;
    char *p;

    int from = 1;
    int to = 200;
    int step = 1;

    struct timeval start, stop;
    double time1,timeg,timeg_c;

    /* never reset: once one size fails, every later line reports failure */
    int test = 1;

    /* use the tolerance that matches the precision being built */
#ifdef DOUBLE
    const double eps = DOUBLE_EPS;
#else
    const double eps = SINGLE_EPS;
#endif

    argc--;argv++;

    if (argc > 0) { from = atol(*argv); argc--; argv++;}
    if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
    if (argc > 0) { step = atol(*argv); argc--; argv++;}

    if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
    if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

    /* a non-positive step would never finish the size loop, and a
       non-positive repetition count would average over zero runs */
    if (step <= 0) step = 1;
    if (loops <= 0) loops = 1;

    /* inc_x is a blasint; cast it for %d exactly as m is cast for %6d below */
    fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

    if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }

#ifdef linux
    srandom(getpid());
#endif

    fprintf(stderr, " SIZE Flops Time CTime Test\n");

    for(m = from; m <= to; m += step)
    {

        timeg=0;
        timeg_c=0;

        fprintf(stderr, " %6d :", (int)m);

        for (l=0; l<loops; l++)
        {

            /* fresh random data in [-0.5, 0.5] for every repetition */
            for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
                x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
            }

            gettimeofday( &start, (struct timezone *)0);
            result = MIN_ (&m, x, &inc_x);
            gettimeofday( &stop, (struct timezone *)0);
            time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
            timeg += time1;

            gettimeofday( &start, (struct timezone *)0);
            result_c = min_c(m, x, inc_x);
            gettimeofday( &stop, (struct timezone *)0);
            time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
            timeg_c += time1;

            test &= assert_dbl_near(result, result_c, eps);

        }

        timeg /= loops;
        timeg_c /= loops;
        fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");

    }

    return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 303
- 0
ztest/rot.c View File

@@ -0,0 +1,303 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Compare two doubles using an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually obtained
 * tol  - largest accepted absolute difference
 *
 * Returns 1 when |exp - real| does not exceed tol and 0 otherwise.
 * A NaN in either value is always reported as a mismatch (0).
 */
int assert_dbl_near(double exp, double real, double tol) {
    double diff;
    double absdiff;

    /* x != x holds only for NaN; without this guard every comparison
       below is false for NaN and a NaN result would be accepted */
    if (exp != exp || real != real) {
        return 0;
    }

    diff = exp - real;
    absdiff = diff;
    /* avoid using fabs and linking with a math lib */
    if (diff < 0) {
        absdiff *= -1;
    }
    if (absdiff > tol) {
        return 0;
    }
    return 1;
}

#ifdef COMPLEX
/*
 * Plain C reference for the complex plane rotation with real c and s.
 *
 * x and y hold interleaved (real, imaginary) pairs; element k of x is at
 * x[2*k*inc_x] and x[2*k*inc_x + 1], likewise for y. For each of the n
 * elements both components are replaced by
 *     x' = c*x + s*y
 *     y' = c*y - s*x
 * Always returns 0; nothing is touched when n <= 0.
 */
int zrot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG k;
    BLASLONG px = 0;
    BLASLONG py = 0;
    BLASLONG step_x;
    BLASLONG step_y;

    if (n <= 0) {
        return 0;
    }

    /* two FLOATs per complex element */
    step_x = 2 * inc_x;
    step_y = 2 * inc_y;

    for (k = 0; k < n; k++) {
        FLOAT rotated[2];

        /* keep the new x values aside: y still needs the old x */
        rotated[0] = c*x[px] + s*y[py] ;
        rotated[1] = c*x[px+1] + s*y[py+1] ;
        y[py] = c*y[py] - s*x[px] ;
        y[py+1] = c*y[py+1] - s*x[px+1] ;
        x[px] = rotated[0] ;
        x[px+1] = rotated[1] ;

        px += step_x;
        py += step_y;
    }

    return 0;
}
#else
/*
 * Plain C reference for the real plane rotation.
 *
 * For k = 0 .. n-1, with xk = x[k*inc_x] and yk = y[k*inc_y]:
 *     xk' = c*xk + s*yk
 *     yk' = c*yk - s*xk
 * Always returns 0; nothing is touched when n <= 0.
 */
int rot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG k;
    BLASLONG px = 0;
    BLASLONG py = 0;

    if (n <= 0) {
        return 0;
    }

    for (k = 0; k < n; k++) {
        /* keep the new x value aside: y still needs the old x */
        FLOAT rotated = c*x[px] + s*y[py] ;

        y[py] = c*y[py] - s*x[px] ;
        x[px] = rotated ;

        px += inc_x;
        py += inc_y;
    }

    return 0;
}
#endif

#undef ROT
#ifdef COMPLEX
#ifdef DOUBLE
#define ROT BLASFUNC(zdrot)
#else
#define ROT BLASFUNC(csrot)
#endif
#else
#ifdef DOUBLE
#define ROT BLASFUNC(drot)
#else
#define ROT BLASFUNC(srot)
#endif
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

int gettimeofday(struct timeval *tv, void *tz){

FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;

if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);

tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;

/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}

return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/*
 * Allocate memory from a private SysV shared-memory segment requested
 * with SHM_HUGETLB. The requested size is padded by HUGE_PAGESIZE and
 * masked down to a multiple of it.
 *
 * Prints a message and exits with status 1 when shmget() or shmat()
 * fails. The segment is marked for removal right after attaching.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
    void *mem;
    int segment;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mem = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mem == -1){
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mem;
}

#define malloc huge_malloc

#endif

/*
 * Timing and verification driver for ROT.
 *
 * Command line: [from [to [step]]] select the vector sizes to run.
 * Environment:  OPENBLAS_LOOPS (repetitions per size), OPENBLAS_INCX and
 *               OPENBLAS_INCY (strides passed to both routines).
 *
 * For every size the library routine runs on x/y and the C reference on
 * the copies x_c/y_c; both are timed and the buffers compared. One line
 * per size goes to stderr. Always returns 0.
 */
int main(int argc, char *argv[]){

    FLOAT *x, *y, *x_c, *y_c;
    blasint m, i;
    blasint inc_x=1,inc_y=1;
    FLOAT c[1] = { 2.0 };
    FLOAT s[1] = { 2.0 };
    int loops = 1;
    int l;
    char *p;

    int from = 1;
    int to = 200;
    int step = 1;

    struct timeval start, stop;
    double time1,timeg,timeg_c;

    /* never reset: once one size fails, every later line reports failure */
    int test = 1;

    /* use the tolerance that matches the precision being built */
#ifdef DOUBLE
    const double eps = DOUBLE_EPS;
#else
    const double eps = SINGLE_EPS;
#endif

    argc--;argv++;

    if (argc > 0) { from = atol(*argv); argc--; argv++;}
    if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
    if (argc > 0) { step = atol(*argv); argc--; argv++;}

    if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
    if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
    if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);

    /* a non-positive step would never finish the size loop, and a
       non-positive repetition count would average over zero runs */
    if (step <= 0) step = 1;
    if (loops <= 0) loops = 1;

    /* the increments are blasint; cast them for %d as m is cast for %6d below */
    fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,(int)inc_x,(int)inc_y,loops);

    if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }

    if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }

    if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }

    if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }

#ifdef linux
    srandom(getpid());
#endif

    fprintf(stderr, " SIZE Flops Time CTime Test\n");

    for(m = from; m <= to; m += step)
    {

        timeg=0;
        timeg_c=0;

        fprintf(stderr, " %6d :", (int)m);

        for (l=0; l<loops; l++)
        {

            /* identical random input for the library call and the reference */
            for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
                x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
                x_c[i] = x[i];
            }

            for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
                y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
                y_c[i] = y[i];
            }
            gettimeofday( &start, (struct timezone *)0);
            ROT (&m, x, &inc_x, y, &inc_y, c, s);
            gettimeofday( &stop, (struct timezone *)0);
            time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
            timeg += time1;

            gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
            zrot_c(m, x_c, inc_x, y_c, inc_y, *c, *s);
#else
            rot_c(m, x_c, inc_x, y_c, inc_y, *c, *s);
#endif
            gettimeofday( &stop, (struct timezone *)0);
            time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
            timeg_c += time1;

            /* Compare every initialised slot of both buffers. Walking the
               data with index i*inc never looks at imaginary parts once the
               stride exceeds 1 and leaves the buffer for negative strides;
               a full compare also catches writes between strided elements. */
            for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) {
                test &= assert_dbl_near(x[i], x_c[i], eps);
            }
            for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) {
                test &= assert_dbl_near(y[i], y_c[i], eps);
            }
        }

        timeg /= loops;
        timeg_c /= loops;

        fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 2. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");

    }

    return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 308
- 0
ztest/scal.c View File

@@ -0,0 +1,308 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Compare two doubles using an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually obtained
 * tol  - largest accepted absolute difference
 *
 * Returns 1 when |exp - real| does not exceed tol and 0 otherwise.
 * A NaN in either value is always reported as a mismatch (0).
 */
int assert_dbl_near(double exp, double real, double tol) {
    double diff;
    double absdiff;

    /* x != x holds only for NaN; without this guard every comparison
       below is false for NaN and a NaN result would be accepted */
    if (exp != exp || real != real) {
        return 0;
    }

    diff = exp - real;
    absdiff = diff;
    /* avoid using fabs and linking with a math lib */
    if (diff < 0) {
        absdiff *= -1;
    }
    if (absdiff > tol) {
        return 0;
    }
    return 1;
}

#ifdef COMPLEX
/*
 * Plain C reference for complex scaling: x := (da_r + i*da_i) * x.
 *
 * x holds interleaved (real, imaginary) pairs; element k is at
 * x[2*k*inc_x] and x[2*k*inc_x + 1]. When a part of the scalar is exactly
 * zero the products with it are skipped, and a zero scalar stores plain
 * zeros. dummy0, dummy1, y, inc_y, dummy and dummy2 are not used.
 *
 * Always returns 0; nothing is touched when n <= 0 or inc_x <= 0.
 */
int zscal_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG pos = 0;
    BLASLONG stride;

    if (!(n > 0 && inc_x > 0)) {
        return 0;
    }

    /* two FLOATs per complex element */
    stride = 2 * inc_x;

    for (k = 0; k < n; k++) {
        /* new real part; stored last because the imaginary part needs the old one */
        FLOAT re;

        if (da_r == 0.0) {
            if (da_i == 0.0) {
                re = 0.0;
                x[pos+1] = 0.0 ;
            } else {
                re = - da_i * x[pos+1] ;
                x[pos+1] = da_i * x[pos] ;
            }
        } else if (da_i == 0.0) {
            re = da_r * x[pos] ;
            x[pos+1] = da_r * x[pos+1];
        } else {
            re = da_r * x[pos] - da_i * x[pos+1] ;
            x[pos+1] = da_r * x[pos+1] + da_i * x[pos] ;
        }
        x[pos] = re;

        pos += stride;
    }

    return 0;
}
#else
/*
 * Plain C reference for real scaling: x[k*inc_x] := da * x[k*inc_x]
 * for k = 0 .. n-1. A zero scalar stores plain zeros instead of
 * multiplying. dummy0, dummy1, y, inc_y, dummy and dummy2 are not used.
 *
 * Always returns 0; nothing is touched when n <= 0 or inc_x <= 0.
 */
int scal_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG pos = 0;

    if (!(n > 0 && inc_x > 0)) {
        return 0;
    }

    for (k = 0; k < n; k++) {
        if (da == 0.0) {
            x[pos] = 0.0;
        } else {
            x[pos] = da * x[pos] ;
        }
        pos += inc_x;
    }

    return 0;
}
#endif

#undef SCAL
#ifdef COMPLEX
#ifdef DOUBLE
#define SCAL BLASFUNC(zscal)
#else
#define SCAL BLASFUNC(cscal)
#endif
#else
#ifdef DOUBLE
#define SCAL BLASFUNC(dscal)
#else
#define SCAL BLASFUNC(sscal)
#endif
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

int gettimeofday(struct timeval *tv, void *tz){

FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;

if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);

tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;

/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}

return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

/*
 * Allocate memory from a private SysV shared-memory segment requested
 * with SHM_HUGETLB. The requested size is padded by HUGE_PAGESIZE and
 * masked down to a multiple of it.
 *
 * Prints a message and exits with status 1 when shmget() or shmat()
 * fails. The segment is marked for removal right after attaching.
 */
static void *huge_malloc(BLASLONG size){
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
    void *mem;
    int segment;

    segment = shmget(IPC_PRIVATE,
                     (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                     SHM_HUGETLB | IPC_CREAT |0600);
    if (segment < 0) {
        printf( "Memory allocation failed(shmget).\n");
        exit(1);
    }

    mem = shmat(segment, NULL, SHM_RND);
    if ((BLASLONG)mem == -1){
        printf( "Memory allocation failed(shmat).\n");
        exit(1);
    }

    shmctl(segment, IPC_RMID, 0);

    return mem;
}

#define malloc huge_malloc

#endif

/*
 * Timing and verification driver for SCAL.
 *
 * Command line: [from [to [step]]] select the vector sizes to run.
 * Environment:  OPENBLAS_LOOPS (repetitions per size) and
 *               OPENBLAS_INCX (stride passed to both routines).
 *
 * For every size the library routine runs on x and the C reference on the
 * copy x_c; both are timed and the buffers compared. One line per size
 * goes to stderr. Always returns 0.
 */
int main(int argc, char *argv[]){

    FLOAT *x, *x_c;
    FLOAT alpha[2] = { 2.0, 2.0 };
    blasint m, i;
    blasint inc_x=1;
    int loops = 1;
    int l;
    char *p;

    int from = 1;
    int to = 200;
    int step = 1;

    struct timeval start, stop;
    double time1,timeg,timeg_c;

    /* never reset: once one size fails, every later line reports failure */
    int test = 1;

    /* use the tolerance that matches the precision being built */
#ifdef DOUBLE
    const double eps = DOUBLE_EPS;
#else
    const double eps = SINGLE_EPS;
#endif

    argc--;argv++;

    if (argc > 0) { from = atol(*argv); argc--; argv++;}
    if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
    if (argc > 0) { step = atol(*argv); argc--; argv++;}

    if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
    if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);

    /* a non-positive step would never finish the size loop, and a
       non-positive repetition count would average over zero runs */
    if (step <= 0) step = 1;
    if (loops <= 0) loops = 1;

    /* inc_x is a blasint; cast it for %d exactly as m is cast for %6d below */
    fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops);

    if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }

    if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
        fprintf(stderr,"Out of Memory!!\n");exit(1);
    }

#ifdef linux
    srandom(getpid());
#endif

    fprintf(stderr, " SIZE Flops Time CTime Test\n");

    for(m = from; m <= to; m += step)
    {

        timeg=0;
        timeg_c=0;

        fprintf(stderr, " %6d :", (int)m);

        for (l=0; l<loops; l++)
        {

            /* identical random input for the library call and the reference */
            for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
                x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
                x_c[i] = x[i];
            }

            gettimeofday( &start, (struct timezone *)0);
            SCAL (&m, alpha, x, &inc_x);
            gettimeofday( &stop, (struct timezone *)0);
            time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
            timeg += time1;

            gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
            zscal_c(m, 0, 0, alpha[0],alpha[1], x_c, inc_x, NULL, 0, NULL, 0);
#else
            scal_c(m, 0, 0, *alpha, x_c, inc_x, NULL, 0, NULL, 0);
#endif
            gettimeofday( &stop, (struct timezone *)0);
            time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
            timeg_c += time1;

            /* Compare every initialised slot of both buffers. Walking the
               data with index i*inc_x never looks at imaginary parts once
               the stride exceeds 1 and leaves the buffer for negative
               strides; a full compare also catches writes between strided
               elements. */
            for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) {
                test &= assert_dbl_near(x[i], x_c[i], eps);
            }
        }

        timeg /= loops;
        timeg_c /= loops;

#ifdef COMPLEX
        fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 6. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#else
        fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#endif

    }

    return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

+ 306
- 0
ztest/swap.c View File

@@ -0,0 +1,306 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"

#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13

/*
 * Compare two doubles using an absolute tolerance.
 *
 * exp  - expected value
 * real - value actually obtained
 * tol  - largest accepted absolute difference
 *
 * Returns 1 when |exp - real| does not exceed tol and 0 otherwise.
 * A NaN in either value is always reported as a mismatch (0).
 */
int assert_dbl_near(double exp, double real, double tol) {
    double diff;
    double absdiff;

    /* x != x holds only for NaN; without this guard every comparison
       below is false for NaN and a NaN result would be accepted */
    if (exp != exp || real != real) {
        return 0;
    }

    diff = exp - real;
    absdiff = diff;
    /* avoid using fabs and linking with a math lib */
    if (diff < 0) {
        absdiff *= -1;
    }
    if (absdiff > tol) {
        return 0;
    }
    return 1;
}

#ifdef COMPLEX
/*
 * Plain C reference for the complex vector swap.
 *
 * x and y hold interleaved (real, imaginary) pairs; element k of x is at
 * x[2*k*inc_x] and x[2*k*inc_x + 1], likewise for y. The n elements of x
 * and y are exchanged pairwise. dummy0, dummy1, dummy3, dummy4, dummy and
 * dummy2 are not used.
 *
 * Always returns 0.
 */
int zswap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG px = 0;
    BLASLONG py = 0;
    BLASLONG step_x;
    BLASLONG step_y;

    if (n < 0) {
        return 0;
    }

    /* two FLOATs per complex element */
    step_x = 2 * inc_x;
    step_y = 2 * inc_y;

    for (k = 0; k < n; k++) {
        FLOAT held[2];

        held[0] = x[px] ;
        held[1] = x[px+1] ;
        x[px] = y[py] ;
        x[px+1] = y[py+1] ;
        y[py] = held[0] ;
        y[py+1] = held[1] ;

        px += step_x;
        py += step_y;
    }

    return 0;
}
#else
/*
 * Plain C reference for the real vector swap: exchanges x[k*inc_x] with
 * y[k*inc_y] for k = 0 .. n-1. dummy0, dummy1, dummy3, dummy and dummy2
 * are not used.
 *
 * Always returns 0.
 */
int swap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG k;
    BLASLONG px = 0;
    BLASLONG py = 0;

    if (n < 0) {
        return 0;
    }

    for (k = 0; k < n; k++) {
        FLOAT held = x[px] ;

        x[px] = y[py] ;
        y[py] = held ;

        px += inc_x;
        py += inc_y;
    }

    return 0;
}
#endif

#undef SWAP
#ifdef COMPLEX
#ifdef DOUBLE
#define SWAP BLASFUNC(zswap)
#else
#define SWAP BLASFUNC(cswap)
#endif
#else
#ifdef DOUBLE
#define SWAP BLASFUNC(dswap)
#else
#define SWAP BLASFUNC(sswap)
#endif
#endif

#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

int gettimeofday(struct timeval *tv, void *tz){

FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;

if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);

tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;

/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}

return 0;
}

#endif

#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * Allocate `size` bytes from a SysV shared-memory segment requested with
 * SHM_HUGETLB.  The requested segment length is size + HUGE_PAGESIZE
 * masked down to a HUGE_PAGESIZE boundary.
 *
 * The segment is marked for removal right after it is attached.
 * On any failure a message is printed and the process exits with status 1,
 * so the function only returns a usable address.
 */
static void *huge_malloc(BLASLONG size){
  size_t segment_bytes = (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1);
  int segment_id;
  void *mapped;

  segment_id = shmget(IPC_PRIVATE, segment_bytes,
                      SHM_HUGETLB | IPC_CREAT |0600);
  if (segment_id < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment_id, NULL, SHM_RND);
  if (-1 == (BLASLONG)mapped) {
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment_id, IPC_RMID, 0);

  return mapped;
}

#define malloc huge_malloc

#endif

/*
 * Benchmark and self-check driver for the library swap routine.
 *
 * Usage:        <prog> [from [to [step]]]
 * Environment:  OPENBLAS_LOOPS  repetitions per size (default 1)
 *               OPENBLAS_INCX   stride of x (default 1)
 *               OPENBLAS_INCY   stride of y (default 1)
 *
 * For every size m in [from, to] the library routine (SWAP) and the plain
 * C reference are run on identical random data, both are timed, and the
 * resulting buffers are compared.  One line per size is written to stderr.
 * The pass/fail flag is cumulative: once a mismatch is seen, every later
 * line also reports failure.
 *
 * Returns 0; exits with status 1 if a buffer cannot be allocated.
 */
int main(int argc, char *argv[]){

  FLOAT *x, *y, *x_c, *y_c;
  FLOAT *x_ref, *y_ref;
  blasint m, i;
  blasint inc_x=1,inc_y=1;
  blasint span_x, span_y;
  blasint len_x, len_y;
  int loops = 1;
  int l;
  char *p;

  int from =   1;
  int to   = 200;
  int step =   1;

  struct timeval start, stop;
  double time1,timeg,timeg_c;

  int test = 1;

  argc--;argv++;

  if (argc > 0) { from     = atol(*argv);		argc--; argv++;}
  if (argc > 0) { to       = MAX(atol(*argv), from);	argc--; argv++;}
  if (argc > 0) { step     = atol(*argv);		argc--; argv++;}

  if ((p = getenv("OPENBLAS_LOOPS")))  loops = atoi(p);
  if ((p = getenv("OPENBLAS_INCX")))   inc_x = atoi(p);
  if ((p = getenv("OPENBLAS_INCY")))   inc_y = atoi(p);

  /* A non-positive step would never reach `to`; a non-positive loop count
     would divide the accumulated times by zero (or a negative number). */
  if (step < 1)  step = 1;
  if (loops < 1) loops = 1;

  /* Stride magnitudes used for buffer sizing.  A zero stride still touches
     element 0, so size the buffers as if the stride were 1. */
  span_x = (inc_x < 0) ? -inc_x : inc_x;
  span_y = (inc_y < 0) ? -inc_y : inc_y;
  if (span_x == 0) span_x = 1;
  if (span_y == 0) span_y = 1;

  fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,(int)inc_x,(int)inc_y,loops);

  if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * span_x * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

  if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * span_y * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

  if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * span_x * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

  if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * span_y * COMPSIZE)) == NULL){
    fprintf(stderr,"Out of Memory!!\n");exit(1);
  }

#ifdef linux
  srandom(getpid());
#endif

  fprintf(stderr, " SIZE Flops Time CTime Test\n");

  for(m = from; m <= to; m += step)
  {

    timeg=0;
    timeg_c=0;

    fprintf(stderr, " %6d :", (int)m);

    /* Number of FLOATs actually spanned by each vector at this size. */
    len_x = m * COMPSIZE * span_x;
    len_y = m * COMPSIZE * span_y;

    /* The reference loops index forward from the pointer they are given and
       step by the signed stride.  For a negative stride, hand them the last
       logical element so that every access stays inside the buffer and the
       traversal matches the backwards walk defined by reference BLAS. */
    x_ref = x_c;
    y_ref = y_c;
    if (m > 0) {
      if (inc_x < 0) x_ref = x_c + (m - 1) * span_x * COMPSIZE;
      if (inc_y < 0) y_ref = y_c + (m - 1) * span_y * COMPSIZE;
    }

    for (l=0; l<loops; l++)
    {

      /* Identical random input for the library call and the reference. */
      for(i = 0; i < len_x; i++){
        x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        x_c[i] = x[i];
      }

      for(i = 0; i < len_y; i++){
        y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
        y_c[i] = y[i];
      }

      gettimeofday( &start, (struct timezone *)0);
      SWAP (&m, x, &inc_x, y, &inc_y );
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg += time1;

      gettimeofday( &start, (struct timezone *)0);
#ifdef COMPLEX
      zswap_c(m, 0, 0, 0, 0, x_ref, inc_x, y_ref, inc_y, NULL, 0);
#else
      swap_c(m, 0, 0, 0, x_ref, inc_x, y_ref, inc_y, NULL, 0);
#endif
      gettimeofday( &stop, (struct timezone *)0);
      time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
      timeg_c += time1;

      /* Compare the complete buffers: swapped elements must match the
         reference, and the gaps between strided elements must have been
         left untouched by both implementations. */
      for (i = 0; i < len_x; i++) {
        test &= assert_dbl_near(x[i], x_c[i], SINGLE_EPS);
      }
      for (i = 0; i < len_y; i++) {
        test &= assert_dbl_near(y[i], y_c[i], SINGLE_EPS);
      }
    }

    timeg /= loops;
    timeg_c /= loops;

#ifdef COMPLEX
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 6. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#else
    fprintf(stderr, "%10.2f MFlops %10.6f sec %10.6f sec %s\n", 1. * (double)m / timeg * 1.e-6, timeg, timeg_c, test ? "PASS" : "FAILD");
#endif

  }

  return 0;
}

// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

Loading…
Cancel
Save