Browse Source

Merge pull request #2072 from martin-frbg/sum

Add (C)BLAS extension ?sum
tags/v0.3.6^2
Martin Kroeker GitHub 6 years ago
parent
commit
ccfb7ead15
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
52 changed files with 5640 additions and 27 deletions
  1. +5
    -0
      cblas.h
  2. +7
    -1
      cmake/kernel.cmake
  3. +2
    -0
      common_c.h
  4. +2
    -0
      common_d.h
  5. +7
    -0
      common_interface.h
  6. +7
    -0
      common_level1.h
  7. +6
    -0
      common_macro.h
  8. +6
    -0
      common_param.h
  9. +2
    -0
      common_q.h
  10. +2
    -0
      common_s.h
  11. +2
    -0
      common_x.h
  12. +2
    -0
      common_z.h
  13. +3
    -0
      interface/CMakeLists.txt
  14. +43
    -13
      interface/Makefile
  15. +97
    -0
      interface/sum.c
  16. +1
    -0
      kernel/CMakeLists.txt
  17. +54
    -7
      kernel/Makefile.L1
  18. +206
    -0
      kernel/alpha/sum.S
  19. +208
    -0
      kernel/alpha/zsum.S
  20. +5
    -0
      kernel/arm/KERNEL.ARMV5
  21. +3
    -0
      kernel/arm/KERNEL.ARMV6
  22. +51
    -0
      kernel/arm/sum.c
  23. +425
    -0
      kernel/arm/sum_vfp.S
  24. +57
    -0
      kernel/arm/zsum.c
  25. +164
    -0
      kernel/arm64/csum.S
  26. +186
    -0
      kernel/arm64/sum.S
  27. +158
    -0
      kernel/arm64/zsum.S
  28. +4
    -0
      kernel/ia64/KERNEL
  29. +358
    -0
      kernel/ia64/sum.S
  30. +5
    -0
      kernel/mips/KERNEL.P5600
  31. +47
    -0
      kernel/mips/sum.c
  32. +52
    -0
      kernel/mips/zsum.c
  33. +332
    -0
      kernel/mips64/sum.S
  34. +204
    -0
      kernel/mips64/zsum.S
  35. +446
    -0
      kernel/power/sum.S
  36. +452
    -0
      kernel/power/zsum.S
  37. +6
    -6
      kernel/setparam-ref.c
  38. +325
    -0
      kernel/sparc/sum.S
  39. +327
    -0
      kernel/sparc/zsum.S
  40. +5
    -0
      kernel/x86/KERNEL.generic
  41. +207
    -0
      kernel/x86/sum.S
  42. +208
    -0
      kernel/x86/zsum.S
  43. +5
    -0
      kernel/x86_64/KERNEL.generic
  44. +179
    -0
      kernel/x86_64/sum.S
  45. +180
    -0
      kernel/x86_64/zsum.S
  46. +5
    -0
      kernel/zarch/KERNEL.Z13
  47. +5
    -0
      kernel/zarch/KERNEL.Z14
  48. +5
    -0
      kernel/zarch/KERNEL.ZARCH_GENERIC
  49. +137
    -0
      kernel/zarch/csum.c
  50. +148
    -0
      kernel/zarch/dsum.c
  51. +151
    -0
      kernel/zarch/ssum.c
  52. +136
    -0
      kernel/zarch/zsum.c

+ 5
- 0
cblas.h View File

@@ -73,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

/* OpenBLAS extension ?sum (not part of the reference CBLAS API): returns the
 * sum of n elements of x, taken incx apart, without taking absolute values.
 * The real variants take a typed pointer; the complex variants (scsum, dzsum)
 * take a void pointer and still return a real value.
 * NOTE(review): for the complex variants the result is presumably the sum of
 * all real and imaginary parts, mirroring scasum/dzasum - confirm against the
 * zsum kernels. */
float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX);
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);


+ 7
- 1
cmake/kernel.cmake View File

@@ -107,6 +107,12 @@ macro(SetDefaultL1)
set(DAXPBYKERNEL ../arm/axpby.c)
set(CAXPBYKERNEL ../arm/zaxpby.c)
set(ZAXPBYKERNEL ../arm/zaxpby.c)
set(SSUMKERNEL sum.S)
set(DSUMKERNEL sum.S)
set(CSUMKERNEL zsum.S)
set(ZSUMKERNEL zsum.S)
set(QSUMKERNEL sum.S)
set(XSUMKERNEL zsum.S)
endmacro ()

macro(SetDefaultL2)
@@ -162,4 +168,4 @@ macro(SetDefaultL3)
set(DGEADD_KERNEL ../generic/geadd.c)
set(CGEADD_KERNEL ../generic/zgeadd.c)
set(ZGEADD_KERNEL ../generic/zgeadd.c)
endmacro ()
endmacro ()

+ 2
- 0
common_c.h View File

@@ -19,6 +19,7 @@
#define CDOTC_K cdotc_k
#define CNRM2_K cnrm2_k
#define CSCAL_K cscal_k
#define CSUM_K csum_k
#define CSWAP_K cswap_k
#define CROT_K csrot_k

@@ -249,6 +250,7 @@
#define CDOTC_K gotoblas -> cdotc_k
#define CNRM2_K gotoblas -> cnrm2_k
#define CSCAL_K gotoblas -> cscal_k
#define CSUM_K gotoblas -> csum_k
#define CSWAP_K gotoblas -> cswap_k
#define CROT_K gotoblas -> csrot_k



+ 2
- 0
common_d.h View File

@@ -19,6 +19,7 @@
#define DDOTC_K ddot_k
#define DNRM2_K dnrm2_k
#define DSCAL_K dscal_k
#define DSUM_K dsum_k
#define DSWAP_K dswap_k
#define DROT_K drot_k

@@ -174,6 +175,7 @@
#define DDOTC_K gotoblas -> ddot_k
#define DNRM2_K gotoblas -> dnrm2_k
#define DSCAL_K gotoblas -> dscal_k
#define DSUM_K gotoblas -> dsum_k
#define DSWAP_K gotoblas -> dswap_k
#define DROT_K gotoblas -> drot_k



+ 7
- 0
common_interface.h View File

@@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *);
double BLASFUNC(dzasum)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);

/* Fortran-callable entry points of the ?sum extension. Every argument is
 * passed by pointer; interface/sum.c names them N, x and INCX in that order.
 * The sc/dz/qx variants take the complex-precision vector as a pointer to its
 * real base type and return a real result. */
FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *);
FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *);
double BLASFUNC(dsum) (blasint *, double *, blasint *);
xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *);
double BLASFUNC(dzsum)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *);

blasint BLASFUNC(isamax)(blasint *, float *, blasint *);
blasint BLASFUNC(idamax)(blasint *, double *, blasint *);
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);


+ 7
- 0
common_level1.h View File

@@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG);
double zasum_k (BLASLONG, double *, BLASLONG);
xdouble xasum_k (BLASLONG, xdouble *, BLASLONG);

/* Kernel-level ?sum routines, one per precision (s/d/q real, c/z/x complex).
 * Arguments are passed by value as (n, x, incx); interface/sum.c reaches them
 * through the SUM_K macro. The complex kernels return a real value. */
float ssum_k (BLASLONG, float *, BLASLONG);
double dsum_k (BLASLONG, double *, BLASLONG);
xdouble qsum_k (BLASLONG, xdouble *, BLASLONG);
float csum_k (BLASLONG, float *, BLASLONG);
double zsum_k (BLASLONG, double *, BLASLONG);
xdouble xsum_k (BLASLONG, xdouble *, BLASLONG);

float samax_k (BLASLONG, float *, BLASLONG);
double damax_k (BLASLONG, double *, BLASLONG);
xdouble qamax_k (BLASLONG, xdouble *, BLASLONG);


+ 6
- 0
common_macro.h View File

@@ -66,6 +66,7 @@
#define DOTC_K QDOTC_K
#define NRM2_K QNRM2_K
#define SCAL_K QSCAL_K
#define SUM_K QSUM_K
#define SWAP_K QSWAP_K
#define ROT_K QROT_K

@@ -356,6 +357,7 @@
#define DOTC_K DDOTC_K
#define NRM2_K DNRM2_K
#define SCAL_K DSCAL_K
#define SUM_K DSUM_K
#define SWAP_K DSWAP_K
#define ROT_K DROT_K

@@ -658,6 +660,7 @@
#define DOTC_K SDOTC_K
#define NRM2_K SNRM2_K
#define SCAL_K SSCAL_K
#define SUM_K SSUM_K
#define SWAP_K SSWAP_K
#define ROT_K SROT_K

@@ -962,6 +965,7 @@
#define DOTC_K XDOTC_K
#define NRM2_K XNRM2_K
#define SCAL_K XSCAL_K
#define SUM_K XSUM_K
#define SWAP_K XSWAP_K
#define ROT_K XROT_K

@@ -1363,6 +1367,7 @@
#define DOTC_K ZDOTC_K
#define NRM2_K ZNRM2_K
#define SCAL_K ZSCAL_K
#define SUM_K ZSUM_K
#define SWAP_K ZSWAP_K
#define ROT_K ZROT_K

@@ -1785,6 +1790,7 @@
#define DOTC_K CDOTC_K
#define NRM2_K CNRM2_K
#define SCAL_K CSCAL_K
#define SUM_K CSUM_K
#define SWAP_K CSWAP_K
#define ROT_K CROT_K



+ 6
- 0
common_param.h View File

@@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);

float (*snrm2_k) (BLASLONG, float *, BLASLONG);
float (*sasum_k) (BLASLONG, float *, BLASLONG);
float (*ssum_k) (BLASLONG, float *, BLASLONG);
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
@@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);

double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
double (*dasum_k) (BLASLONG, double *, BLASLONG);
double (*dsum_k) (BLASLONG, double *, BLASLONG);
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
@@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);

xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG);
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
@@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);

float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
float (*casum_k) (BLASLONG, float *, BLASLONG);
float (*csum_k) (BLASLONG, float *, BLASLONG);
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
@@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);

double (*znrm2_k) (BLASLONG, double *, BLASLONG);
double (*zasum_k) (BLASLONG, double *, BLASLONG);
double (*zsum_k) (BLASLONG, double *, BLASLONG);
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
@@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);

xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG);
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);


+ 2
- 0
common_q.h View File

@@ -19,6 +19,7 @@
#define QDOTC_K qdot_k
#define QNRM2_K qnrm2_k
#define QSCAL_K qscal_k
#define QSUM_K qsum_k
#define QSWAP_K qswap_k
#define QROT_K qrot_k

@@ -161,6 +162,7 @@
#define QDOTC_K gotoblas -> qdot_k
#define QNRM2_K gotoblas -> qnrm2_k
#define QSCAL_K gotoblas -> qscal_k
#define QSUM_K gotoblas -> qsum_k
#define QSWAP_K gotoblas -> qswap_k
#define QROT_K gotoblas -> qrot_k



+ 2
- 0
common_s.h View File

@@ -12,6 +12,7 @@
#define ISMAX_K ismax_k
#define ISMIN_K ismin_k
#define SASUM_K sasum_k
#define SSUM_K ssum_k
#define SAXPYU_K saxpy_k
#define SAXPYC_K saxpy_k
#define SCOPY_K scopy_k
@@ -170,6 +171,7 @@
#define ISMAX_K gotoblas -> ismax_k
#define ISMIN_K gotoblas -> ismin_k
#define SASUM_K gotoblas -> sasum_k
#define SSUM_K gotoblas -> ssum_k
#define SAXPYU_K gotoblas -> saxpy_k
#define SAXPYC_K gotoblas -> saxpy_k
#define SCOPY_K gotoblas -> scopy_k


+ 2
- 0
common_x.h View File

@@ -19,6 +19,7 @@
#define XDOTC_K xdotc_k
#define XNRM2_K xnrm2_k
#define XSCAL_K xscal_k
#define XSUM_K xsum_k
#define XSWAP_K xswap_k
#define XROT_K xqrot_k

@@ -227,6 +228,7 @@
#define XDOTC_K gotoblas -> xdotc_k
#define XNRM2_K gotoblas -> xnrm2_k
#define XSCAL_K gotoblas -> xscal_k
#define XSUM_K gotoblas -> xsum_k
#define XSWAP_K gotoblas -> xswap_k
#define XROT_K gotoblas -> xqrot_k



+ 2
- 0
common_z.h View File

@@ -19,6 +19,7 @@
#define ZDOTC_K zdotc_k
#define ZNRM2_K znrm2_k
#define ZSCAL_K zscal_k
#define ZSUM_K zsum_k
#define ZSWAP_K zswap_k
#define ZROT_K zdrot_k

@@ -249,6 +250,7 @@
#define ZDOTC_K gotoblas -> zdotc_k
#define ZNRM2_K gotoblas -> znrm2_k
#define ZSCAL_K gotoblas -> zscal_k
#define ZSUM_K gotoblas -> zsum_k
#define ZSWAP_K gotoblas -> zswap_k
#define ZROT_K gotoblas -> zdrot_k



+ 3
- 0
interface/CMakeLists.txt View File

@@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES
rotm.c rotmg.c # N.B. these do not have complex counterparts
rot.c
asum.c
sum.c
)

# these will have 'z' prepended for the complex version
@@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES})
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX")
GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX")
GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX")
GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX")
endif ()
if (${float_type} STREQUAL "ZCOMPLEX")
GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX")
@@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES})
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
endif ()
endforeach ()



+ 43
- 13
interface/Makefile View File

@@ -25,7 +25,7 @@ SBLAS1OBJS = \
saxpy.$(SUFFIX) sswap.$(SUFFIX) \
scopy.$(SUFFIX) sscal.$(SUFFIX) \
sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \
sasum.$(SUFFIX) snrm2.$(SUFFIX) \
sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \
smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \
smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \
srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \
@@ -51,7 +51,7 @@ DBLAS1OBJS = \
daxpy.$(SUFFIX) dswap.$(SUFFIX) \
dcopy.$(SUFFIX) dscal.$(SUFFIX) \
ddot.$(SUFFIX) \
dasum.$(SUFFIX) dnrm2.$(SUFFIX) \
dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \
dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \
dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \
drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \
@@ -76,7 +76,7 @@ CBLAS1OBJS = \
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \
cdotc.$(SUFFIX) cdotu.$(SUFFIX) \
scasum.$(SUFFIX) scnrm2.$(SUFFIX) \
scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \
scamax.$(SUFFIX) icamax.$(SUFFIX) \
scamin.$(SUFFIX) icamin.$(SUFFIX) \
csrot.$(SUFFIX) crotg.$(SUFFIX) \
@@ -105,7 +105,7 @@ ZBLAS1OBJS = \
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \
zdotc.$(SUFFIX) zdotu.$(SUFFIX) \
dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \
dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \
dzamax.$(SUFFIX) izamax.$(SUFFIX) \
dzamin.$(SUFFIX) izamin.$(SUFFIX) \
zdrot.$(SUFFIX) zrotg.$(SUFFIX) \
@@ -146,7 +146,7 @@ QBLAS1OBJS = \
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
qdot.$(SUFFIX) \
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
@@ -168,7 +168,7 @@ XBLAS1OBJS = \
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
xdotc.$(SUFFIX) xdotu.$(SUFFIX) \
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
@@ -203,7 +203,7 @@ ifdef QUAD_PRECISION
QBLAS1OBJS = \
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
@@ -224,7 +224,7 @@ QBLAS3OBJS = \
XBLAS1OBJS = \
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
@@ -264,7 +264,7 @@ CSBLAS1OBJS = \
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX)
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)

CSBLAS2OBJS = \
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
@@ -282,7 +282,7 @@ CDBLAS1OBJS = \
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX)
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)

CDBLAS2OBJS = \
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
@@ -303,7 +303,7 @@ CCBLAS1OBJS = \
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
cblas_caxpby.$(SUFFIX) \
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX)
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX)

CCBLAS2OBJS = \
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
@@ -330,7 +330,7 @@ CZBLAS1OBJS = \
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
cblas_zaxpby.$(SUFFIX) \
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX)
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX)


CZBLAS2OBJS = \
@@ -565,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c
qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c
$(CC) $(CFLAGS) -c $< -o $(@F)

# Fortran-style ?sum interface objects. All six precisions (s, d, q, sc, dz,
# qx) are compiled from the same sum.c; -DCBLAS is not passed, so the
# non-CBLAS branch of sum.c is built.
# NOTE(review): the per-precision defines are not visible in these rules and
# presumably arrive through $(CFLAGS) set elsewhere in this Makefile - confirm.
# $< is the first prerequisite (sum.c); $(@F) is the file part of the target.
ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)

dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)

qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)

scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)

dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)

qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)

snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c $< -o $(@F)

@@ -1412,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c
cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

# CBLAS ?sum interface objects. They are built from the same sum.c as the
# Fortran-style objects, but with -DCBLAS so that the CNAME (by-value) variant
# in sum.c is compiled instead of the NAME (by-pointer) one.
# Only s, d, sc and dz are provided here; there are no cblas_q*/cblas_qx* rules.
cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

@@ -1419,7 +1449,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)


+ 97
- 0
interface/sum.c View File

@@ -0,0 +1,97 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#include <stdio.h>
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif

#ifndef CBLAS

/*
 * Fortran-style entry point of the ?sum extension (built when CBLAS is not
 * defined).
 *
 *   N    - pointer to the number of elements to sum
 *   x    - vector data, forwarded unchanged to the kernel
 *   INCX - pointer to the increment between consecutive elements
 *
 * Returns 0 when *N <= 0; otherwise the value of SUM_K(n, x, incx) cast to
 * FLOATRET.
 *
 * PRINT_DEBUG_NAME, IDEBUG_START/IDEBUG_END and FUNCTION_PROFILE_START/END
 * are macros defined outside this file.
 * NOTE(review): their expansions are not visible here, so keep the statement
 * order and the local names (n in particular is passed to
 * FUNCTION_PROFILE_END) as they are.
 */
FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){

/* Dereference the by-pointer arguments once and widen them to BLASLONG,
   the type the kernel prototypes take. */
BLASLONG n = *N;
BLASLONG incx = *INCX;
FLOATRET ret;

PRINT_DEBUG_NAME;

/* Empty or negative-length vector: nothing to add, the kernel is not called. */
if (n <= 0) return 0;

IDEBUG_START;

FUNCTION_PROFILE_START();

/* SUM_K resolves to the kernel of the precision being compiled. */
ret = (FLOATRET)SUM_K(n, x, incx);

FUNCTION_PROFILE_END(COMPSIZE, n, n);

IDEBUG_END;

return ret;
}

#else
/*
 * CBLAS entry point of the ?sum extension (built when CBLAS is defined).
 *
 *   n    - number of elements to sum
 *   x/vx - vector data; the COMPLEX build receives it as void * and casts it
 *          to FLOAT * before use
 *   incx - increment between consecutive elements
 *
 * Returns 0 when n <= 0; otherwise the value of SUM_K(n, x, incx).
 *
 * The function header is selected by the preprocessor; both variants share
 * the single body that follows the #endif.
 * NOTE(review): the debug/profile macros are defined outside this file and
 * their expansions are not visible here - keep the statement order as is.
 */
#ifdef COMPLEX
FLOAT CNAME(blasint n, void *vx, blasint incx){
FLOAT *x = (FLOAT*) vx;
#else
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
#endif

FLOAT ret;

PRINT_DEBUG_CNAME;

/* Empty or negative-length vector: nothing to add, the kernel is not called. */
if (n <= 0) return 0;

IDEBUG_START;

FUNCTION_PROFILE_START();

/* SUM_K resolves to the kernel of the precision being compiled. */
ret = SUM_K(n, x, incx);

FUNCTION_PROFILE_END(COMPSIZE, n, n);

IDEBUG_END;

return ret;
}

#endif

+ 1
- 0
kernel/CMakeLists.txt View File

@@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type})

if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type})


+ 54
- 7
kernel/Makefile.L1 View File

@@ -340,6 +340,32 @@ ifndef XSCALKERNEL
XSCALKERNEL = zscal.S
endif

### SUM ###

# Default kernel sources for the ?sum routines. Each variable is assigned only
# if it is not already defined, so an earlier definition takes precedence
# (presumably from a per-architecture KERNEL file - confirm the include order).
# Real precisions (S, D, Q) default to sum.S, complex ones (C, Z, X) to zsum.S.

ifndef SSUMKERNEL
SSUMKERNEL = sum.S
endif

ifndef DSUMKERNEL
DSUMKERNEL = sum.S
endif

ifndef CSUMKERNEL
CSUMKERNEL = zsum.S
endif

ifndef ZSUMKERNEL
ZSUMKERNEL = zsum.S
endif

ifndef QSUMKERNEL
QSUMKERNEL = sum.S
endif

ifndef XSUMKERNEL
XSUMKERNEL = zsum.S
endif

### SWAP ###

ifndef SSWAPKERNEL
@@ -453,7 +479,7 @@ endif
SBLASOBJS += \
samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \
isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \
sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \
snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \
saxpby_k$(TSUFFIX).$(SUFFIX)
@@ -463,31 +489,32 @@ DBLASOBJS += \
idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \
dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \
dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \
daxpby_k$(TSUFFIX).$(SUFFIX)
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX)

QBLASOBJS += \
qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \
iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \
qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX)
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \
qsum_k$(TSUFFIX).$(SUFFIX)

CBLASOBJS += \
camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \
casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \
cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX)
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX)

ZBLASOBJS += \
zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \
zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \
zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX)
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX)

XBLASOBJS += \
xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \
xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \
xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX)
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX)

### AMAX ###

@@ -617,7 +644,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KE
$(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@

### ASUM ###
$(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@

@@ -636,6 +663,26 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE
$(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@

### SUM ###
# Build rules for the six ?sum kernels. Each object is compiled from the
# source named by its ?SUMKERNEL variable; the precision is selected purely by
# preprocessor switches:
#   -UCOMPLEX / -DCOMPLEX            real vs. complex
#   -UDOUBLE / -DDOUBLE / -DXDOUBLE  single, double, extended precision
# so one source file (e.g. sum.S) can serve several precisions.
$(KDIR)ssum_k$(TSUFFIX).$(SUFFIX) $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSUMKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@

$(KDIR)dsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSUMKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@

$(KDIR)qsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSUMKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@

$(KDIR)csum_k$(TSUFFIX).$(SUFFIX) $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSUMKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@

$(KDIR)zsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSUMKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@

$(KDIR)xsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSUMKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@

### AXPY ###
$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@



+ 206
- 0
kernel/alpha/sum.S View File

@@ -0,0 +1,206 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "version.h"

/* Look-ahead distance used by the "ldl $31, PREFETCHSIZE * 2 * SIZE(X)" in
   the main loop. NOTE(review): a load into $31 is presumably a prefetch
   hint ($31 being the Alpha zero register) - confirm. */
#define PREFETCHSIZE 88

/* Integer registers: the three incoming arguments as the routine uses them
   (element count, vector pointer, increment) plus the loop counter I. */
#define N $16
#define X $17
#define INCX $18
#define I $19

/* Four partial-sum accumulators; the final result is left in s0. */
#define s0 $f0
#define s1 $f1
#define s2 $f10
#define s3 $f11

/* Eight registers receiving the elements loaded from X. */
#define a0 $f12
#define a1 $f13
#define a2 $f14
#define a3 $f15
#define a4 $f16
#define a5 $f17
#define a6 $f18
#define a7 $f19

/* Pending addends: a loaded value is copied here with fmov and added to the
   matching accumulator one step later. */
#define t0 $f20
#define t1 $f21
#define t2 $f22
#define t3 $f23

/*
 * Alpha kernel for the real ?sum routine: adds up N elements of X, advancing
 * X by INCX after each element, and leaves the result in s0.
 *
 * Values are copied with fmov (not made absolute), so element signs are
 * preserved. The bulk of the work is done in groups of eight elements with
 * loads issued ahead of the additions; four accumulators (s0..s3) are kept
 * and combined at the end, and t0..t3 hold values that have been loaded but
 * not yet added.
 *
 * PROLOGUE, PROFCODE, EPILOGUE, LD, ADD, SXADDQ and SIZE are macros from the
 * included headers. NOTE(review): presumably LD/ADD select the
 * precision-specific load/add and SXADDQ scales INCX by the element size
 * before adding it to X - confirm in the Alpha common header.
 *
 * The instruction order is hand-scheduled; do not reorder.
 */
PROLOGUE
PROFCODE

/* s0 = t0 = 0; for N <= 0 jump straight to the exit, which returns s0 + t0 = 0. */
fclr s0
unop
fclr t0
ble N, $L999

/* I = N >> 3 = number of full groups of eight; none -> remainder handling. */
sra N, 3, I
fclr s1
fclr s2
ble I, $L15

/* Preload the first six elements (a0..a5) while clearing t1..t3 and s3. */
LD a0, 0 * SIZE(X)
fclr t1
SXADDQ INCX, X, X
fclr t2

LD a1, 0 * SIZE(X)
fclr t3
SXADDQ INCX, X, X
fclr s3

LD a2, 0 * SIZE(X)
SXADDQ INCX, X, X
LD a3, 0 * SIZE(X)
SXADDQ INCX, X, X

LD a4, 0 * SIZE(X)
SXADDQ INCX, X, X
LD a5, 0 * SIZE(X)
SXADDQ INCX, X, X

/* One group is already in flight; if it was the only one, go drain it. */
lda I, -1(I)
ble I, $L13
.align 4

/* Main loop: per iteration, add the four pending values twice over
   (eight additions), move eight loaded values into t0..t3, and load the
   next eight elements (a6, a7, then a0..a5 of the following group). */
$L12:
ADD s0, t0, s0
ldl $31, PREFETCHSIZE * 2 * SIZE(X)
fmov a0, t0
lda I, -1(I)

ADD s1, t1, s1
LD a6, 0 * SIZE(X)
fmov a1, t1
SXADDQ INCX, X, X

ADD s2, t2, s2
LD a7, 0 * SIZE(X)
fmov a2, t2
SXADDQ INCX, X, X

ADD s3, t3, s3
LD a0, 0 * SIZE(X)
fmov a3, t3
SXADDQ INCX, X, X

ADD s0, t0, s0
LD a1, 0 * SIZE(X)
fmov a4, t0
SXADDQ INCX, X, X

ADD s1, t1, s1
LD a2, 0 * SIZE(X)
fmov a5, t1
SXADDQ INCX, X, X

ADD s2, t2, s2
LD a3, 0 * SIZE(X)
fmov a6, t2
SXADDQ INCX, X, X

ADD s3, t3, s3
LD a4, 0 * SIZE(X)
fmov a7, t3
SXADDQ INCX, X, X

LD a5, 0 * SIZE(X)
unop
SXADDQ INCX, X, X
bne I, $L12
.align 4

/* Drain the last group: load its final two elements (a6, a7) and add
   everything that is still pending. */
$L13:
ADD s0, t0, s0
LD a6, 0 * SIZE(X)
fmov a0, t0
SXADDQ INCX, X, X

ADD s1, t1, s1
LD a7, 0 * SIZE(X)
fmov a1, t1
SXADDQ INCX, X, X

ADD s2, t2, s2
fmov a2, t2
ADD s3, t3, s3
fmov a3, t3

ADD s0, t0, s0
fmov a4, t0
ADD s1, t1, s1
fmov a5, t1
ADD s2, t2, s2
fmov a6, t2
ADD s3, t3, s3
fmov a7, t3

/* t1..t3 are flushed here; t0 (holding a4) stays pending and is added by
   the first ADD at $L17 or by the one at $L999. */
ADD s1, t1, s1
ADD s2, t2, s2
ADD s3, t3, s3

/* Fold the accumulators pairwise; s0 + s2 is completed at $L15. */
ADD s0, s1, s0
ADD s2, s3, s2
.align 4

/* Remainder: I = N & 7 elements left after the groups of eight. */
$L15:
and N, 7, I
ADD s0, s2, s0
unop
ble I, $L999
.align 4

/* One element per iteration: add the previously pending value, then load the
   next element and make it the new pending value. */
$L17:
ADD s0, t0, s0
LD a0, 0 * SIZE(X)
SXADDQ INCX, X, X
fmov a0, t0

lda I, -1(I)
bne I, $L17
.align 4

/* Exit: add the last pending value and return with the total in s0. */
$L999:
ADD s0, t0, s0
ret
EPILOGUE

+ 208
- 0
kernel/alpha/zsum.S View File

@@ -0,0 +1,208 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "version.h"

/* Look-ahead of the ldl in $L12: it touches X + PREFETCHSIZE * SIZE. */
#define PREFETCHSIZE 88

/* Integer registers: element count, current element pointer, stride
   (doubled at entry, then consumed by SXADDQ) and loop counter. */
#define N $16
#define X $17
#define INCX $18
#define I $19

/* Partial sums: s0/s2 receive the values loaded from offset 0 (first
   component of each pair), s1/s3 the values loaded from offset 1 * SIZE. */
#define s0 $f0
#define s1 $f1
#define s2 $f10
#define s3 $f11

/* Destinations of the eight loads (four pairs) issued per main-loop pass. */
#define a0 $f12
#define a1 $f13
#define a2 $f14
#define a3 $f15
#define a4 $f16
#define a5 $f17
#define a6 $f18
#define a7 $f19

/* Staging registers: a loaded value is moved here first and is added to a
   partial sum by a later ADD. */
#define t0 $f20
#define t1 $f21
#define t2 $f22
#define t3 $f23

/*
 * Plain (signed) sum over N two-component elements of X: both components of
 * every element are added into a single result, left in s0.
 *
 *  - INCX is doubled at entry because each element spans two values.
 *  - N <= 0 leaves s0 = 0.
 *  - The main loop ($L12/$L13) consumes 4 elements (8 values) per pass,
 *    I = N >> 2; the tail loop ($L17) handles the remaining N & 3 elements.
 *  - Every ADD uses the t-register filled by an earlier fmov, so t0 and t1
 *    are still pending when $L999 is reached and are folded in there.
 */
PROLOGUE
PROFCODE

fclr s0
unop
fclr t0
/* Two values per element: stride in values = 2 * INCX. */
addq INCX, INCX, INCX

fclr s1
unop
fclr t1
ble N, $L999

/* I = number of 4-element passes; s2/s3 must be zero before $L15 uses them. */
fclr s2
sra N, 2, I
fclr s3
ble I, $L15

/* Preload the first three elements (a0..a5), clearing t2/t3 in between. */
LD a0, 0 * SIZE(X)
fclr t2
LD a1, 1 * SIZE(X)
SXADDQ INCX, X, X

LD a2, 0 * SIZE(X)
fclr t3
LD a3, 1 * SIZE(X)
SXADDQ INCX, X, X

LD a4, 0 * SIZE(X)
LD a5, 1 * SIZE(X)
SXADDQ INCX, X, X
lda I, -1(I)

/* One pass is handled entirely by the drain code at $L13. */
ble I, $L13
.align 4

/* Main loop: each group adds the pending t-value into its partial sum,
   loads one more value, and stages an already loaded a-value.  X advances
   once per element, i.e. after every second load. */
$L12:
ADD s0, t0, s0
/* Load whose result goes to $31 and is not used; presumably a prefetch. */
ldl $31, PREFETCHSIZE * SIZE(X)
fmov a0, t0
lda I, -1(I)

ADD s1, t1, s1
LD a6, 0 * SIZE(X)
fmov a1, t1
unop

ADD s2, t2, s2
LD a7, 1 * SIZE(X)
fmov a2, t2
SXADDQ INCX, X, X

/* From here on the loads refill a0..a5 for the next pass. */
ADD s3, t3, s3
LD a0, 0 * SIZE(X)
fmov a3, t3
unop

ADD s0, t0, s0
LD a1, 1 * SIZE(X)
fmov a4, t0
SXADDQ INCX, X, X

ADD s1, t1, s1
LD a2, 0 * SIZE(X)
fmov a5, t1
unop

ADD s2, t2, s2
LD a3, 1 * SIZE(X)
fmov a6, t2
SXADDQ INCX, X, X

ADD s3, t3, s3
LD a4, 0 * SIZE(X)
fmov a7, t3
unop

LD a5, 1 * SIZE(X)
unop
SXADDQ INCX, X, X
bne I, $L12
.align 4

/* Drain: load the last element of the final pass (a6, a7) and add everything
   that is staged or loaded, without issuing further loads. */
$L13:
ADD s0, t0, s0
LD a6, 0 * SIZE(X)
fmov a0, t0

ADD s1, t1, s1
LD a7, 1 * SIZE(X)
fmov a1, t1
SXADDQ INCX, X, X

ADD s2, t2, s2
fmov a2, t2
ADD s3, t3, s3
fmov a3, t3

ADD s0, t0, s0
fmov a4, t0
ADD s1, t1, s1
fmov a5, t1
ADD s2, t2, s2
fmov a6, t2
ADD s3, t3, s3
fmov a7, t3

/* t2/t3 are flushed here; t0/t1 stay pending for $L17/$L999. */
ADD s2, t2, s2
ADD s3, t3, s3

.align 4

/* Tail: merge s2/s3 into s0/s1, then handle I = N & 3 leftover elements. */
$L15:
ADD s0, s2, s0
and N, 3, I
ADD s1, s3, s1
ble I, $L999
.align 4

/* One element per iteration: add the pending t0/t1, then load the next pair
   into t0/t1 (which again stay pending). */
$L17:
ADD s0, t0, s0
LD a0, 0 * SIZE(X)
fmov a0, t0
lda I, -1(I)

ADD s1, t1, s1
LD a1, 1 * SIZE(X)
fmov a1, t1
SXADDQ INCX, X, X

bne I, $L17
.align 4

/* Common exit: fold in the pending values, then combine both component
   sums into the single result in s0. */
$L999:
ADD s0, t0, s0
ADD s1, t1, s1

ADD s0, s1, s0
ret
EPILOGUE

+ 5
- 0
kernel/arm/KERNEL.ARMV5 View File

@@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c

SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c

SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c


+ 3
- 0
kernel/arm/KERNEL.ARMV6 View File

@@ -37,6 +37,9 @@ DASUMKERNEL = asum_vfp.S
CASUMKERNEL = asum_vfp.S
ZASUMKERNEL = asum_vfp.S

SSUMKERNEL = sum_vfp.S
DSUMKERNEL = sum_vfp.S

SAXPYKERNEL = axpy_vfp.S
DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S


+ 51
- 0
kernel/arm/sum.c View File

@@ -0,0 +1,51 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* trivial copy of asum.c with the ABS() removed *
**************************************************************************************/


#include "common.h"
#include <math.h>

/*
 * Plain (signed) sum of n elements of x taken with stride inc_x.
 *
 * n      number of elements to add
 * x      input vector
 * inc_x  distance, in elements, between consecutive inputs
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0; otherwise the elements are added
 * one after another in index order.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT total = 0.0;
    BLASLONG pos;
    BLASLONG end;

    if (!(n > 0 && inc_x > 0)) {
        return total;
    }

    /* One past the last index that will be read. */
    end = n * inc_x;

    for (pos = 0; pos < end; pos += inc_x) {
        total += x[pos];
    }

    return total;
}



+ 425
- 0
kernel/arm/sum_vfp.S View File

@@ -0,0 +1,425 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed *
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

// Not referenced anywhere else in this file.
#define STACKSIZE 256

// Core registers on entry: element count, vector pointer, stride.
#define N r0
#define X r1
#define INC_X r2


// Loop counter.
#define I r12

// Byte offset ahead of X used by the pld prefetches.
#define X_PRE 512

/**************************************************************************************
* Macro definitions
**************************************************************************************/

// Naming used below:
//   KERNEL_F4 / KERNEL_F1 : contiguous input, X is post-incremented by the load
//   KERNEL_S4 / KERNEL_S1 : strided input, X advances by INC_X (already in bytes)
// The F4 macros alternate between two accumulators (d0/d1 or s0/s1); every
// other macro accumulates into d0 or s0 only.

#if !defined(COMPLEX)

#if defined(DOUBLE)

// Real double, contiguous: adds 4 values, two into d0 and two into d1.
.macro KERNEL_F4

pld [ X, #X_PRE ]
vldmia.f64 X!, { d4 - d5 }
vadd.f64 d0 , d0, d4
vldmia.f64 X!, { d6 - d7 }
vadd.f64 d1 , d1, d5
vadd.f64 d0 , d0, d6
vadd.f64 d1 , d1, d7

.endm

// Real double, contiguous: adds 1 value into d0.
.macro KERNEL_F1

vldmia.f64 X!, { d4 }
vadd.f64 d0 , d0, d4

.endm


// Real double, strided: adds 4 values into d0.
.macro KERNEL_S4

vldmia.f64 X, { d4 }
vadd.f64 d0 , d0, d4
add X, X, INC_X

vldmia.f64 X, { d4 }
vadd.f64 d0 , d0, d4
add X, X, INC_X

vldmia.f64 X, { d4 }
vadd.f64 d0 , d0, d4
add X, X, INC_X

vldmia.f64 X, { d4 }
vadd.f64 d0 , d0, d4
add X, X, INC_X

.endm


// Real double, strided: adds 1 value into d0.
.macro KERNEL_S1

vldmia.f64 X, { d4 }
vadd.f64 d0 , d0, d4
add X, X, INC_X

.endm

#else

// Real single, contiguous: adds 4 values, two into s0 and two into s1.
// This variant has no pld of its own; the caller issues it.
.macro KERNEL_F4

vldmia.f32 X!, { s4 - s5 }
vadd.f32 s0 , s0, s4
vldmia.f32 X!, { s6 - s7 }
vadd.f32 s1 , s1, s5
vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7

.endm

// Real single, contiguous: adds 1 value into s0.
.macro KERNEL_F1

vldmia.f32 X!, { s4 }
vadd.f32 s0 , s0, s4

.endm


// Real single, strided: adds 4 values into s0.
.macro KERNEL_S4

vldmia.f32 X, { s4 }
vadd.f32 s0 , s0, s4
add X, X, INC_X

vldmia.f32 X, { s4 }
vadd.f32 s0 , s0, s4
add X, X, INC_X

vldmia.f32 X, { s4 }
vadd.f32 s0 , s0, s4
add X, X, INC_X

vldmia.f32 X, { s4 }
vadd.f32 s0 , s0, s4
add X, X, INC_X

.endm


// Real single, strided: adds 1 value into s0.
.macro KERNEL_S1

vldmia.f32 X, { s4 }
vadd.f32 s0 , s0, s4
add X, X, INC_X

.endm


#endif

#else

#if defined(DOUBLE)

// Complex double, contiguous: adds 8 values (4 two-component elements),
// alternating between d0 and d1.
.macro KERNEL_F4

pld [ X, #X_PRE ]
vldmia.f64 X!, { d4 - d5 }
vadd.f64 d0 , d0, d4
vldmia.f64 X!, { d6 - d7 }
vadd.f64 d1 , d1, d5
vadd.f64 d0 , d0, d6
vadd.f64 d1 , d1, d7

pld [ X, #X_PRE ]
vldmia.f64 X!, { d4 - d5 }
vadd.f64 d0 , d0, d4
vldmia.f64 X!, { d6 - d7 }
vadd.f64 d1 , d1, d5
vadd.f64 d0 , d0, d6
vadd.f64 d1 , d1, d7


.endm

// Complex double, contiguous: adds both components of 1 element into d0.
.macro KERNEL_F1

vldmia.f64 X!, { d4 }
vadd.f64 d0 , d0, d4

vldmia.f64 X!, { d4 }
vadd.f64 d0 , d0, d4


.endm


// Complex double, strided: adds both components of 4 elements into d0.
.macro KERNEL_S4

vldmia.f64 X, { d4 -d5 }
vadd.f64 d0 , d0, d4
vadd.f64 d0 , d0, d5
add X, X, INC_X

vldmia.f64 X, { d4 -d5 }
vadd.f64 d0 , d0, d4
vadd.f64 d0 , d0, d5
add X, X, INC_X

vldmia.f64 X, { d4 -d5 }
vadd.f64 d0 , d0, d4
vadd.f64 d0 , d0, d5
add X, X, INC_X

vldmia.f64 X, { d4 -d5 }
vadd.f64 d0 , d0, d4
vadd.f64 d0 , d0, d5
add X, X, INC_X

.endm


// Complex double, strided: adds both components of 1 element into d0.
.macro KERNEL_S1

vldmia.f64 X, { d4 -d5 }
vadd.f64 d0 , d0, d4
vadd.f64 d0 , d0, d5
add X, X, INC_X

.endm

#else

// Complex single, contiguous: adds 8 values (4 two-component elements),
// alternating between s0 and s1.
.macro KERNEL_F4

pld [ X, #X_PRE ]
vldmia.f32 X!, { s4 - s5 }
vadd.f32 s0 , s0, s4
vldmia.f32 X!, { s6 - s7 }
vadd.f32 s1 , s1, s5
vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7

vldmia.f32 X!, { s4 - s5 }
vadd.f32 s0 , s0, s4
vldmia.f32 X!, { s6 - s7 }
vadd.f32 s1 , s1, s5
vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7


.endm

// Complex single, contiguous: adds both components of 1 element into s0.
.macro KERNEL_F1

vldmia.f32 X!, { s4 }
vadd.f32 s0 , s0, s4

vldmia.f32 X!, { s4 }
vadd.f32 s0 , s0, s4

.endm


// Complex single, strided: adds both components of 4 elements into s0.
.macro KERNEL_S4

vldmia.f32 X, { s4 -s5 }
vadd.f32 s0 , s0, s4
vadd.f32 s0 , s0, s5
add X, X, INC_X

vldmia.f32 X, { s4 -s5 }
vadd.f32 s0 , s0, s4
vadd.f32 s0 , s0, s5
add X, X, INC_X

vldmia.f32 X, { s4 -s5 }
vadd.f32 s0 , s0, s4
vadd.f32 s0 , s0, s5
add X, X, INC_X

vldmia.f32 X, { s4 -s5 }
vadd.f32 s0 , s0, s4
vadd.f32 s0 , s0, s5
add X, X, INC_X

.endm


// Complex single, strided: adds both components of 1 element into s0.
.macro KERNEL_S1

vldmia.f32 X, { s4 -s5 }
vadd.f32 s0 , s0, s4
vadd.f32 s0 , s0, s5
add X, X, INC_X

.endm

#endif

#endif

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

// Plain (signed) sum of N elements of X with stride INC_X.
//
//   N     (r0)  number of elements
//   X     (r1)  input vector
//   INC_X (r2)  stride in elements; converted to bytes on the strided path
//
// The result is accumulated in d0/d1 (DOUBLE) or s0/s1 and combined into
// d0 / s0 at the exit label.  N <= 0 or INC_X <= 0 returns 0 without
// touching memory.
//
// INC_X == 1 takes the contiguous path (KERNEL_F4 / KERNEL_F1), any other
// positive stride takes the strided path (KERNEL_S4 / KERNEL_S1).  Both
// paths process N / 4 groups of four elements followed by N % 4 single
// elements.
PROLOGUE

.align 5

movs r12, #0 // clear floating point register
vmov s0, r12
vmov s1, r12
#if defined(DOUBLE)
// Turn the single-precision zeros into double-precision zeros in d0/d1.
vcvt.f64.f32 d0, s0
vcvt.f64.f32 d1, s1
#endif

cmp N, #0
ble asum_kernel_L999

// A non-positive stride yields 0; a negative one must not reach the strided
// loop, which would step X backwards past the start of the vector.
cmp INC_X, #0
ble asum_kernel_L999

cmp INC_X, #1
bne asum_kernel_S_BEGIN


asum_kernel_F_BEGIN:

asrs I, N, #2 // I = N / 4
ble asum_kernel_F1

.align 5

// Contiguous main loop, unrolled twice: two KERNEL_F4 expansions per trip,
// with an early exit in between when the group count is odd.
asum_kernel_F4:

#if !defined(DOUBLE) && !defined(COMPLEX)
// The real single-precision KERNEL_F4 has no prefetch of its own.
pld [ X, #X_PRE ]
#endif
KERNEL_F4

subs I, I, #1
ble asum_kernel_F1

KERNEL_F4

subs I, I, #1
bne asum_kernel_F4

asum_kernel_F1:

ands I, N, #3 // I = N % 4
ble asum_kernel_L999

asum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne asum_kernel_F10

b asum_kernel_L999

asum_kernel_S_BEGIN:

// Convert the stride from elements to bytes.
#if defined(COMPLEX)

#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif

#else

#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif

#endif

asrs I, N, #2 // I = N / 4
ble asum_kernel_S1

.align 5

asum_kernel_S4:

KERNEL_S4

subs I, I, #1
bne asum_kernel_S4

asum_kernel_S1:

ands I, N, #3 // I = N % 4
ble asum_kernel_L999

asum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne asum_kernel_S10


asum_kernel_L999:


// Combine the two accumulators.
#if defined(DOUBLE)
vadd.f64 d0 , d0, d1 // set return value
#else
vadd.f32 s0 , s0, s1 // set return value
#endif

// When not built for the VFP procedure-call standard, also copy the result
// into core registers (r0, or r0:r1 for DOUBLE).
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov r0, s0
#else
vmov r0, r1, d0
#endif
#endif

bx lr

EPILOGUE


+ 57
- 0
kernel/arm/zsum.c View File

@@ -0,0 +1,57 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* trivial copy of zasum.c with the ABS() removed *
**************************************************************************************/


#include "common.h"
#include <math.h>

/*
 * Sum of the two components stored at x[i] and x[i+1].
 * Arguments and expansion are parenthesised so the macro stays correct
 * inside larger expressions.
 */
#define CSUM1(x,i) ((x)[(i)] + (x)[(i) + 1])

/*
 * Plain (signed) sum over n two-component elements of x: both components of
 * every element are added into one result.
 *
 * n      number of two-component elements
 * x      input vector, two consecutive values per element
 * inc_x  distance, in elements, between consecutive inputs
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
FLOAT sumf = 0.0;
BLASLONG inc_x2;

if (n <= 0 || inc_x <= 0) return(sumf);

/* Each element occupies two values, so the index advances by 2 * inc_x. */
inc_x2 = 2 * inc_x;

/* From here on n is one past the last first-component index to read. */
n *= inc_x2;
while(i < n)
{
sumf += CSUM1(x,i);
i += inc_x2;
}
return(sumf);
}



+ 164
- 0
kernel/arm64/csum.S View File

@@ -0,0 +1,164 @@
/*******************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* REG0: zero source for clearing SUMF; SUMF: running sum; TMPF: scratch
   scalar (lane 0 of v1).  TMPVF and SZ are not referenced in this file. */
#define REG0 wzr
#define SUMF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4

/******************************************************************************/

/* Contiguous, one element: load the pair into v1.2s, rotate it into v2 so
   that s2 holds the second value, add both (s1 + s2) and accumulate the
   result into SUMF.  X advances by 8 bytes. */
.macro KERNEL_F1
ld1 {v1.2s}, [X], #8
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, TMPF, s2
fadd SUMF, SUMF, TMPF
.endm

/* Contiguous, eight elements: load 16 floats (64 bytes) into v1..v4 and
   accumulate them lane-wise into v0.4s. */
.macro KERNEL_F8
ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X]
add X, X, #64

PRFM PLDL1KEEP, [X, #1024]

fadd v1.4s, v1.4s, v2.4s
fadd v3.4s, v3.4s, v4.4s
fadd v0.4s, v0.4s, v1.4s
fadd v0.4s, v0.4s, v3.4s
.endm

/* Reduce the four lanes of v0 to the scalar SUMF: add the upper half onto
   the lower half, then add the two remaining lanes pairwise. */
.macro KERNEL_F8_FINALIZE
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp SUMF, v0.2s
.endm

/* Strided setup: scale INC_X by 8, the byte size of one two-float element. */
.macro INIT_S
lsl INC_X, INC_X, #3
.endm

/* Strided, one element: same arithmetic as KERNEL_F1, but X advances by the
   byte stride in INC_X. */
.macro KERNEL_S1
ld1 {v1.2s}, [X], INC_X
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, TMPF, s2
fadd SUMF, SUMF, TMPF

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

/*
 * Plain (signed) sum over N two-float elements of X with stride INC_X; both
 * values of every element are added into the single result SUMF (s0).
 *
 * N <= 0 or INC_X <= 0 returns 0.  INC_X == 1 uses the vector path (8
 * elements per KERNEL_F8, then N & 7 single elements); any other stride
 * uses the strided path (4 x KERNEL_S1 per trip, then N & 3 single elements).
 */
PROLOGUE

/* Clear the running sum and the scratch scalar. */
fmov SUMF, REG0
fmov s1, SUMF

cmp N, xzr
ble .Lcsum_kernel_L999
cmp INC_X, xzr
ble .Lcsum_kernel_L999

cmp INC_X, #1
bne .Lcsum_kernel_S_BEGIN

.Lcsum_kernel_F_BEGIN:

/* I = number of 8-element groups; skip the vector loop when there is none. */
asr I, N, #3
cmp I, xzr
beq .Lcsum_kernel_F1

.Lcsum_kernel_F8:

KERNEL_F8

subs I, I, #1
bne .Lcsum_kernel_F8

/* Collapse the vector accumulator into SUMF before the scalar tail. */
KERNEL_F8_FINALIZE

.Lcsum_kernel_F1:

/* I = leftover elements (N & 7). */
ands I, N, #7
ble .Lcsum_kernel_L999

.Lcsum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne .Lcsum_kernel_F10

.Lcsum_kernel_L999:
ret

.Lcsum_kernel_S_BEGIN:

/* Convert the stride to bytes. */
INIT_S

/* I = number of 4-element groups. */
asr I, N, #2
cmp I, xzr
ble .Lcsum_kernel_S1

.Lcsum_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne .Lcsum_kernel_S4

.Lcsum_kernel_S1:

/* I = leftover elements (N & 3). */
ands I, N, #3
ble .Lcsum_kernel_L999

.Lcsum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne .Lcsum_kernel_S10

ret

EPILOGUE

+ 186
- 0
kernel/arm64/sum.S View File

@@ -0,0 +1,186 @@
/*******************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* Precision-dependent names: REG0 is the zero source used to clear SUMF,
   SUMF the running sum, TMPF a scratch scalar (lane 0 of v1), TMPVF the same
   lane in ld1 syntax, SZ the element size in bytes. */
#if !defined(DOUBLE)
#define REG0 wzr
#define SUMF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define REG0 xzr
#define SUMF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif

/******************************************************************************/

/* Contiguous, one element: load it into TMPF (X += SZ) and add it to SUMF. */
.macro KERNEL_F1
ldr TMPF, [X], #SZ
fadd SUMF, SUMF, TMPF
.endm

/* Contiguous, eight elements, accumulated lane-wise into v0. */
.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X], #32 // load 8 floats into v1 and v2, X += 32 bytes
fadd v1.4s, v1.4s, v2.4s // lane-wise v1 += v2
fadd v0.4s, v0.4s, v1.4s // lane-wise v0 += v1
PRFM PLDL1KEEP, [X, #1024]
#else // DOUBLE
ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X]
add X, X, #64

PRFM PLDL1KEEP, [X, #1024]

fadd v2.2d, v2.2d, v3.2d
fadd v4.2d, v4.2d, v5.2d
fadd v0.2d, v0.2d, v2.2d
fadd v0.2d, v0.2d, v4.2d
#endif
.endm

/* Reduce the lanes of v0 to the scalar SUMF. */
.macro KERNEL_F8_FINALIZE
#if !defined(DOUBLE)
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp SUMF, v0.2s
#else
faddp SUMF, v0.2d
#endif
.endm

/* Strided setup: scale INC_X from elements to bytes (x4 or x8). */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
#else
lsl INC_X, INC_X, #3
#endif
.endm

/* Strided, one element: load it into lane 0 of v1, advance X by the byte
   stride, and add it to SUMF. */
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
fadd SUMF, SUMF, TMPF
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

/*
 * Plain (signed) sum of N elements of X with stride INC_X; the result is
 * accumulated in SUMF (s0 or d0).
 *
 * N <= 0 or INC_X <= 0 returns 0.  INC_X == 1 uses the vector path (8
 * elements per KERNEL_F8, then N & 7 single elements); any other stride
 * uses the strided path (4 x KERNEL_S1 per trip, then N & 3 single elements).
 */
PROLOGUE

/* Clear the running sum and the scratch scalar. */
fmov SUMF, REG0
#if !defined(DOUBLE)
fmov s1, SUMF
#else
fmov d1, SUMF
#endif

cmp N, xzr
ble .Lsum_kernel_L999
cmp INC_X, xzr
ble .Lsum_kernel_L999

cmp INC_X, #1
bne .Lsum_kernel_S_BEGIN

.Lsum_kernel_F_BEGIN:

/* I = number of 8-element groups; skip the vector loop when there is none. */
asr I, N, #3
cmp I, xzr
beq .Lsum_kernel_F1

.Lsum_kernel_F8:

KERNEL_F8

subs I, I, #1
bne .Lsum_kernel_F8

/* Collapse the vector accumulator into SUMF before the scalar tail. */
KERNEL_F8_FINALIZE

.Lsum_kernel_F1:

/* I = leftover elements (N & 7). */
ands I, N, #7
ble .Lsum_kernel_L999

.Lsum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne .Lsum_kernel_F10

.Lsum_kernel_L999:
ret

.Lsum_kernel_S_BEGIN:

/* Convert the stride to bytes. */
INIT_S

/* I = number of 4-element groups. */
asr I, N, #2
cmp I, xzr
ble .Lsum_kernel_S1

.Lsum_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne .Lsum_kernel_S4

.Lsum_kernel_S1:

/* I = leftover elements (N & 3). */
ands I, N, #3
ble .Lsum_kernel_L999

.Lsum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne .Lsum_kernel_S10

ret

EPILOGUE

+ 158
- 0
kernel/arm64/zsum.S View File

@@ -0,0 +1,158 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* REG0: zero source for clearing SUMF; SUMF: running sum; TMPF: scratch
   scalar.  TMPVF and SZ are not referenced in this file. */
#define REG0 xzr
#define SUMF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8

/******************************************************************************/

/* Contiguous, one element: load the pair of doubles (X += 16 bytes), add
   the two lanes together and accumulate the result into SUMF. */
.macro KERNEL_F1
ld1 {v1.2d}, [X], #16
faddp TMPF, v1.2d
fadd SUMF, SUMF, TMPF
.endm

/* Contiguous, four elements: load 8 doubles (64 bytes) into v1..v4 and
   accumulate them lane-wise into v0.2d. */
.macro KERNEL_F4
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64

fadd v1.2d, v1.2d, v2.2d
fadd v3.2d, v3.2d, v4.2d

fadd v0.2d, v0.2d, v1.2d
fadd v0.2d, v0.2d, v3.2d

PRFM PLDL1KEEP, [X, #1024]
.endm

/* Reduce the two lanes of v0 to the scalar SUMF. */
.macro KERNEL_F4_FINALIZE
faddp SUMF, v0.2d
.endm

/* Strided setup: scale INC_X by 16, the byte size of one two-double element. */
.macro INIT_S
lsl INC_X, INC_X, #4
.endm

/* Strided, one element: same arithmetic as KERNEL_F1, but X advances by the
   byte stride in INC_X. */
.macro KERNEL_S1
ld1 {v1.2d}, [X], INC_X
faddp TMPF, v1.2d
fadd SUMF, SUMF, TMPF
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

/*
 * Plain (signed) sum over N two-double elements of X with stride INC_X; both
 * values of every element are added into the single result SUMF (d0).
 *
 * N <= 0 or INC_X <= 0 returns 0.  INC_X == 1 uses the vector path (4
 * elements per KERNEL_F4, then N & 3 single elements); any other stride
 * uses the strided path (4 x KERNEL_S1 per trip, then N & 3 single elements).
 */
PROLOGUE

/* Clear the running sum. */
fmov SUMF, REG0

cmp N, xzr
ble .Lzsum_kernel_L999
cmp INC_X, xzr
ble .Lzsum_kernel_L999

cmp INC_X, #1
bne .Lzsum_kernel_S_BEGIN

.Lzsum_kernel_F_BEGIN:

/* I = number of 4-element groups; skip the vector loop when there is none. */
asr I, N, #2
cmp I, xzr
beq .Lzsum_kernel_F1

.Lzsum_kernel_F4:

KERNEL_F4

subs I, I, #1
bne .Lzsum_kernel_F4

/* Collapse the vector accumulator into SUMF before the scalar tail. */
KERNEL_F4_FINALIZE

.Lzsum_kernel_F1:

/* I = leftover elements (N & 3). */
ands I, N, #3
ble .Lzsum_kernel_L999

.Lzsum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne .Lzsum_kernel_F10

.Lzsum_kernel_L999:
ret

.Lzsum_kernel_S_BEGIN:

/* Convert the stride to bytes. */
INIT_S

/* I = number of 4-element groups. */
asr I, N, #2
cmp I, xzr
ble .Lzsum_kernel_S1

.Lzsum_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne .Lzsum_kernel_S4

.Lzsum_kernel_S1:

/* I = leftover elements (N & 3). */
ands I, N, #3
ble .Lzsum_kernel_L999

.Lzsum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne .Lzsum_kernel_S10

ret

EPILOGUE

+ 4
- 0
kernel/ia64/KERNEL View File

@@ -60,6 +60,10 @@ CASUMKERNEL = asum.S
ZASUMKERNEL = asum.S
XASUMKERNEL = asum.S

CSUMKERNEL = sum.S
ZSUMKERNEL = sum.S
XSUMKERNEL = sum.S

CNRM2KERNEL = nrm2.S
ZNRM2KERNEL = nrm2.S
XNRM2KERNEL = nrm2.S


+ 358
- 0
kernel/ia64/sum.S View File

@@ -0,0 +1,358 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2019, The OpenBLAS project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Prefetch look-ahead in elements, chosen per precision; the kernel sets
   PRE1 = X + PREFETCH_SIZE * SIZE. */
#ifdef XDOUBLE
#define PREFETCH_SIZE ( 8 * 16 + 4)
#elif defined(DOUBLE)
#define PREFETCH_SIZE (16 * 16 + 8)
#else
#define PREFETCH_SIZE (32 * 16 + 16)
#endif

/* COMPADD is 1 for COMPLEX builds and shifts the element counts and the
   stride scaling by one bit.  STRIDE is the post-increment of the
   even-numbered loads: the full stride for real data, SIZE (step to the
   second value of the pair) for COMPLEX data. */
#ifndef COMPLEX
#define COMPADD 0
#define STRIDE INCX
#else
#define COMPADD 1
#define STRIDE SIZE
#endif

/* Prefetch pointer. */
#define PRE1 r2

/* I: main-loop trip count minus one; J: leftover count; INCX16: prefetch
   pointer increment per main-loop pass. */
#define I r17
#define J r18
#define INCX16 r21

/* Save slots for the predicate registers and ar.lc. */
#define PR r30
#define ARLC r31

/* Incoming arguments. */
#define N r32
#define X r33
#define INCX r34


/*
 * Plain (signed) sum of the values of X, result in f8.
 *
 * Returns 0 (f8 = f0) when INCX <= 0 or N <= 0.
 *
 * The main loop (.L52) issues 16 loads per pass and keeps four partial sums
 * in f8..f11; it runs N >> (4 - COMPADD) times.  The tail (.L55) handles the
 * remaining elements in blocks selected by the low bits of N.  For COMPLEX
 * builds every element contributes two values: the loads alternate between
 * a post-increment of SIZE and one of INCX - SIZE.
 *
 * NOTE(review): the loop relies on br.ctop register rotation (a value loaded
 * into f32 under p16 is consumed as f34 under p18, and the last four loads
 * as f71/f74/f77/f80 under p19 or after the loop) -- confirm against the
 * IA-64 architecture manual before changing any register number.
 */
PROLOGUE
.prologue
PROFCODE
/* Set up the prefetch pointer, clear the result, save ar.lc. */
{ .mfi
adds PRE1 = PREFETCH_SIZE * SIZE, X
mov f8 = f0
.save ar.lc, ARLC
mov ARLC = ar.lc
}
;;
.body
/* With F_INTERFACE, N and INCX arrive as addresses: load the values, and
   sign-extend them when integers are 32 bits wide. */
#ifdef F_INTERFACE
{ .mmi
LDINT N = [N]
LDINT INCX = [INCX]
nop.i 0
}
;;
#ifndef USE64BITINT
{ .mii
nop.m 0
sxt4 N = N
sxt4 INCX = INCX
}
;;
#endif
#endif
/* p6 = !(0 < INCX), p7 = !(0 < N); I = number of full passes. */
{ .mmi
cmp.lt p0, p6 = r0, INCX
cmp.lt p0, p7 = r0, N
shr I = N, (4 - COMPADD)
}
/* J = elements left for the tail; return early on a non-positive argument. */
{ .mbb
and J = ((1 << (4 - COMPADD)) - 1), N
(p6) br.ret.sptk.many b0
(p7) br.ret.sptk.many b0
}
;;
/* Loop count is passes - 1; clear f10; save the predicate registers. */
{ .mfi
adds I = -1, I
mov f10 = f0
mov PR = pr
}
/* p9 = no leftover elements; p12 = tail contains the block loaded under p12. */
{ .mfi
cmp.eq p9, p0 = r0, J
mov f9 = f0
tbit.z p0, p12 = N, 3 - COMPADD
}
;;
/* Stage predicates: p16 starts true, p17..p19 start false; epilog count 3. */
{ .mmi
cmp.eq p16, p0 = r0, r0
cmp.ne p17, p0 = r0, r0
mov ar.ec= 3
}
/* Scale the stride by BASE_SHIFT (+1 for COMPLEX). */
{ .mfi
cmp.ne p18, p0 = r0, r0
mov f11 = f0
shl INCX = INCX, BASE_SHIFT + COMPADD
}
;;
/* INCX16 = prefetch pointer step per pass (or per half pass for XDOUBLE,
   which prefetches twice per pass). */
{ .mmi
#ifdef XDOUBLE
shladd INCX16 = INCX, (3 - COMPADD), r0
#else
shladd INCX16 = INCX, (4 - COMPADD), r0
#endif
cmp.ne p19, p0 = r0, r0
mov ar.lc = I
}
/* p8 = no full pass: go straight to the tail.  For COMPLEX, INCX becomes
   the step from the second value of one pair to the first of the next. */
{ .mmb
cmp.gt p8 ,p0 = r0, I
#ifdef COMPLEX
adds INCX = - SIZE, INCX
#else
nop.m 0
#endif
(p8) br.cond.dpnt .L55
}
;;
.align 32

/* Main loop: 16 loads under p16 (f32, f35, ..., f77) paired with adds of
   values loaded in earlier passes (p18: f34..f67, p19: f71..f80), cycling
   through the partial sums f8, f9, f10, f11. */
.L52:
{ .mmf
(p16) lfetch.nt1 [PRE1], INCX16
(p16) LDFD f32 = [X], STRIDE
}
{ .mfb
(p19) FADD f8 = f8, f71
}
;;
{ .mmf
(p16) LDFD f35 = [X], INCX
}
{ .mfb
(p19) FADD f9 = f9, f74
}
;;
{ .mmf
(p16) LDFD f38 = [X], STRIDE
}
{ .mfb
(p19) FADD f10 = f10, f77
}
;;
{ .mmf
(p16) LDFD f41 = [X], INCX
}
{ .mfb
(p19) FADD f11 = f11, f80
}
;;
{ .mmf
(p16) LDFD f44 = [X], STRIDE
}
{ .mfb
(p18) FADD f8 = f8, f34
}
;;
{ .mmf
(p16) LDFD f47 = [X], INCX
}
{ .mfb
(p18) FADD f9 = f9, f37
}
;;
{ .mmf
(p16) LDFD f50 = [X], STRIDE
}
{ .mfb
(p18) FADD f10 = f10, f40
}
;;
{ .mmf
(p16) LDFD f53 = [X], INCX
}
{ .mfb
(p18) FADD f11 = f11, f43
}
;;
{ .mmf
#ifdef XDOUBLE
(p16) lfetch.nt1 [PRE1], INCX16
#endif
(p16) LDFD f56 = [X], STRIDE
}
{ .mfb
(p18) FADD f8 = f8, f46
}
;;
{ .mmf
(p16) LDFD f59 = [X], INCX
}
{ .mfb
(p18) FADD f9 = f9, f49
}
;;
{ .mmf
(p16) LDFD f62 = [X], STRIDE
}
{ .mfb
(p18) FADD f10 = f10, f52
}
;;
{ .mmf
(p16) LDFD f65 = [X], INCX
}
{ .mfb
(p18) FADD f11 = f11, f55
}
;;
{ .mmf
(p16) LDFD f68 = [X], STRIDE
}
{ .mfb
(p18) FADD f8 = f8, f58
}
;;
{ .mmf
(p16) LDFD f71 = [X], INCX
}
{ .mfb
(p18) FADD f9 = f9, f61
}
;;
{ .mmf
(p16) LDFD f74 = [X], STRIDE
}
{ .mfb
(p18) FADD f10 = f10, f64
}
;;
{ .mmf
(p16) LDFD f77 = [X], INCX
}
{ .mfb
(p18) FADD f11 = f11, f67
br.ctop.sptk.few .L52
}
;;
/* Add the four values of the last pass that the loop has not consumed yet. */
FADD f8 = f8, f71
FADD f9 = f9, f74
FADD f10 = f10, f77
FADD f11 = f11, f80
.align 32
;;
/* Tail: loads are predicated per block -- p12 covers f32..f39, p13 covers
   f40..f43, p14 covers f44..f45 and (real data only) p15 covers f46.
   p13..p15 are derived from the lower bits of N along the way. */
.L55:
(p12) LDFD f32 = [X], STRIDE
/* Nothing left over: skip to the final reduction. */
(p9) br.cond.dptk .L998
;;
(p12) LDFD f33 = [X], INCX
;;
(p12) LDFD f34 = [X], STRIDE
;;
(p12) LDFD f35 = [X], INCX
tbit.z p0, p13 = N, (2 - COMPADD)
;;
(p12) LDFD f36 = [X], STRIDE
tbit.z p0, p14 = N, (1 - COMPADD)
;;
(p12) LDFD f37 = [X], INCX
#ifndef COMPLEX
tbit.z p0, p15 = N, 0
#endif
;;
(p12) LDFD f38 = [X], STRIDE
;;
(p12) LDFD f39 = [X], INCX
;;
(p13) LDFD f40 = [X], STRIDE
;;
(p13) LDFD f41 = [X], INCX
;;
/* From here the adds of the p12 block overlap the remaining loads. */
(p13) LDFD f42 = [X], STRIDE
(p12) FADD f8 = f8, f32
;;
(p13) LDFD f43 = [X], INCX
(p12) FADD f9 = f9, f33
;;
(p14) LDFD f44 = [X], STRIDE
(p12) FADD f10 = f10, f34
;;
(p14) LDFD f45 = [X], INCX
(p12) FADD f11 = f11, f35
;;
#ifndef COMPLEX
(p15) LDFD f46 = [X]
#endif
(p12) FADD f8 = f8, f36
;;
(p12) FADD f9 = f9, f37
(p12) FADD f10 = f10, f38
(p12) FADD f11 = f11, f39
;;
(p13) FADD f8 = f8, f40
(p13) FADD f9 = f9, f41
#ifndef COMPLEX
#endif
(p13) FADD f10 = f10, f42
;;
(p13) FADD f11 = f11, f43
(p14) FADD f8 = f8, f44
(p14) FADD f9 = f9, f45
#ifndef COMPLEX
(p15) FADD f10 = f10, f46
#endif
;;
.align 32

/* Final reduction: f8 = (f8 + f9) + (f10 + f11); restore ar.lc and the
   predicate registers selected by the mask, then return. */
.L998:
{ .mfi
FADD f8 = f8, f9
mov ar.lc = ARLC
}
{ .mmf
FADD f10 = f10, f11
}
;;
{ .mii
mov pr = PR, -65474
}
;;
{ .mfb
FADD f8 = f8, f10
br.ret.sptk.many b0
}
EPILOGUE

+ 5
- 0
kernel/mips/KERNEL.P5600 View File

@@ -30,6 +30,11 @@ IDMAXKERNEL = ../mips/imax.c
ISMINKERNEL = ../mips/imin.c
IDMINKERNEL = ../mips/imin.c

SSUMKERNEL = ../mips/sum.c
DSUMKERNEL = ../mips/sum.c
CSUMKERNEL = ../mips/zsum.c
ZSUMKERNEL = ../mips/zsum.c

ifdef HAVE_MSA
SASUMKERNEL = ../mips/sasum_msa.c
DASUMKERNEL = ../mips/dasum_msa.c


+ 47
- 0
kernel/mips/sum.c View File

@@ -0,0 +1,47 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>


/*
 * Plain sum (no absolute values) of n elements of x taken inc_x apart.
 * Returns 0.0 when n or inc_x is not positive.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	BLASLONG pos;
	BLASLONG limit;

	/* Nothing to add for an empty vector or a non-positive stride. */
	if (!(n > 0 && inc_x > 0)) {
		return(total);
	}

	/* One past the last index that will be visited. */
	limit = n * inc_x;

	for (pos = 0; pos < limit; pos += inc_x) {
		total += x[pos];
	}

	return(total);
}



+ 52
- 0
kernel/mips/zsum.c View File

@@ -0,0 +1,52 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

/*
 * Sum of the two consecutive scalars (real and imaginary part) stored at
 * index i and i+1 of x.  The arguments and the whole replacement list are
 * parenthesized so the macro keeps its meaning inside larger expressions
 * and with non-trivial argument expressions.
 */
#define CSUM1(x,i) ((x)[(i)] + (x)[(i)+1])

/*
 * Plain sum (no absolute values) of the real and imaginary parts of n
 * complex elements of x, taken inc_x complex elements apart.
 * Returns 0.0 when n or inc_x is not positive.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i=0;
	FLOAT sumf = 0.0;
	BLASLONG inc_x2;

	if (n <= 0 || inc_x <= 0) return(sumf);

	/* Each complex element occupies two consecutive FLOAT slots. */
	inc_x2 = 2 * inc_x;

	/* n becomes the exclusive upper bound on the scalar index i. */
	n *= inc_x2;
	while(i < n)
	{
		sumf += CSUM1(x,i);
		i += inc_x2;
	}
	return(sumf);
}



+ 332
- 0
kernel/mips64/sum.S View File

@@ -0,0 +1,332 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Arguments: element count, vector pointer, element stride. */
#define N $4
#define X $5
#define INCX $6

/* I: loop counter.  TEMP: holds SIZE for the unit-stride test. */
#define I $2
#define TEMP $3

/* a1..a8: values most recently loaded from X. */
#define a1 $f2
#define a2 $f3
#define a3 $f4
#define a4 $f5
#define a5 $f6
#define a6 $f7
#define a7 $f8
#define a8 $f9

/* t1..t4: copies of loaded values waiting to be added. */
#define t1 $f10
#define t2 $f11
#define t3 $f12
#define t4 $f13

/* s1, s2: the two partial sums; s1 receives the final result. */
#define s1 $f0
#define s2 $f1

/*
 * Plain (signed) sum of N scalars of X with stride INCX.
 * Two paths: unit stride (.L12/.L16) and general stride (.L23/.L26), each
 * unrolled by 8 with a one-element remainder loop.  N <= 0 returns zero.
 * This routine makes no test on the sign of INCX.
 * The instruction written directly after each branch is used as its delay
 * slot (assumes PROLOGUE selects noreorder assembly - TODO confirm in the
 * common header).
 */
PROLOGUE

#ifdef F_INTERFACE
/* In this build N and INCX arrive as pointers and are loaded here. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif

/* Clear both partial sums from integer register $0. */
MTC $0, s1

MTC $0, s2
/* Convert the stride from elements to bytes. */
dsll INCX, INCX, BASE_SHIFT

blez N, .L999
li TEMP, SIZE

/* Non-unit stride goes to .L20.  The delay slot computes I = N / 8, which
   both paths rely on. */
bne INCX, TEMP, .L20
dsra I, N, 3

blez I, .L15
NOP

/* Unit stride: preload the first block of eight values. */
LD a1, 0 * SIZE(X)
LD a2, 1 * SIZE(X)
LD a3, 2 * SIZE(X)
LD a4, 3 * SIZE(X)

LD a5, 4 * SIZE(X)
MOV t1, a1
LD a6, 5 * SIZE(X)
MOV t2, a2
LD a7, 6 * SIZE(X)
MOV t3, a3

MOV t4, a4
daddiu I, I, -1

blez I, .L13
LD a8, 7 * SIZE(X)
.align 3

/* Steady state: add the eight values of the current block while loading
   the eight values of the next one. */
.L12:
ADD s1, s1, t1
LD a1, 8 * SIZE(X)

MOV t1, a5
daddiu I, I, -1

ADD s2, s2, t2
LD a2, 9 * SIZE(X)

MOV t2, a6
NOP

ADD s1, s1, t3
LD a3, 10 * SIZE(X)

MOV t3, a7
NOP

ADD s2, s2, t4
LD a4, 11 * SIZE(X)

MOV t4, a8
daddiu X, X, 8 * SIZE

ADD s1, s1, t1
LD a5, 4 * SIZE(X)

MOV t1, a1
NOP

ADD s2, s2, t2
LD a6, 5 * SIZE(X)

MOV t2, a2
NOP

ADD s1, s1, t3
LD a7, 6 * SIZE(X)

MOV t3, a3
NOP

ADD s2, s2, t4
LD a8, 7 * SIZE(X)

bgtz I, .L12
MOV t4, a4
.align 3

/* Drain: add the last preloaded block without issuing further loads. */
.L13:
ADD s1, s1, t1
daddiu X, X, 8 * SIZE

MOV t1, a5
NOP

ADD s2, s2, t2
MOV t2, a6

ADD s1, s1, t3
MOV t3, a7

ADD s2, s2, t4
MOV t4, a8

ADD s1, s1, t1
ADD s2, s2, t2
ADD s1, s1, t3
ADD s2, s2, t4
.align 3

/* Unit-stride remainder: N mod 8 elements, one per iteration. */
.L15:
andi I, N, 7

blez I, .L999
NOP
.align 3

.L16:
LD a1, 0 * SIZE(X)
daddiu I, I, -1

MOV t1, a1

ADD s1, s1, t1

bgtz I, .L16
daddiu X, X, SIZE

j .L999
NOP
.align 3

/* General stride: same scheme, with X advanced by INCX after every load.
   I still holds N / 8 from the delay slot of the bne above. */
.L20:
blez I, .L25
NOP

LD a1, 0 * SIZE(X)
daddu X, X, INCX

LD a2, 0 * SIZE(X)
daddu X, X, INCX

LD a3, 0 * SIZE(X)
daddu X, X, INCX

LD a4, 0 * SIZE(X)
daddu X, X, INCX

LD a5, 0 * SIZE(X)
daddu X, X, INCX

LD a6, 0 * SIZE(X)
daddu X, X, INCX

MOV t1, a1
LD a7, 0 * SIZE(X)

MOV t2, a2
daddu X, X, INCX

MOV t3, a3
LD a8, 0 * SIZE(X)

MOV t4, a4
daddiu I, I, -1

blez I, .L24
daddu X, X, INCX
.align 3

.L23:
ADD s1, s1, t1
LD a1, 0 * SIZE(X)

MOV t1, a5
daddu X, X, INCX

ADD s2, s2, t2
LD a2, 0 * SIZE(X)

MOV t2, a6
daddu X, X, INCX

ADD s1, s1, t3
LD a3, 0 * SIZE(X)

MOV t3, a7
daddu X, X, INCX

ADD s2, s2, t4
LD a4, 0 * SIZE(X)

MOV t4, a8
daddu X, X, INCX

ADD s1, s1, t1
LD a5, 0 * SIZE(X)

MOV t1, a1
daddu X, X, INCX

ADD s2, s2, t2
LD a6, 0 * SIZE(X)

MOV t2, a2
daddu X, X, INCX

ADD s1, s1, t3
LD a7, 0 * SIZE(X)

MOV t3, a3
daddu X, X, INCX

ADD s2, s2, t4
LD a8, 0 * SIZE(X)

MOV t4, a4
daddiu I, I, -1

bgtz I, .L23
daddu X, X, INCX
.align 3

/* Drain the last preloaded block of the strided path. */
.L24:
ADD s1, s1, t1
MOV t1, a5

ADD s2, s2, t2
MOV t2, a6

ADD s1, s1, t3
MOV t3, a7

ADD s2, s2, t4
MOV t4, a8

ADD s1, s1, t1
ADD s2, s2, t2
ADD s1, s1, t3
ADD s2, s2, t4
.align 3

/* Strided remainder: N mod 8 elements, one per iteration. */
.L25:
andi I, N, 7

blez I, .L999
NOP
.align 3

.L26:
LD a1, 0 * SIZE(X)
daddiu I, I, -1

MOV t1, a1
daddu X, X, INCX

bgtz I, .L26
ADD s1, s1, t1
.align 3

/* Return; the delay slot folds the second partial sum into s1. */
.L999:
j $31
ADD s1, s1, s2

EPILOGUE

+ 204
- 0
kernel/mips64/zsum.S View File

@@ -0,0 +1,204 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Arguments: complex element count, vector pointer, element stride. */
#define N $4
#define X $5
#define INCX $6

/* I: loop counter.  TEMP is defined but not used in this routine. */
#define I $2
#define TEMP $3

/* a1..a8: loaded scalars; odd names hold the scalar at offset 0 of an
   element, even names the scalar at offset 1. */
#define a1 $f2
#define a2 $f3
#define a3 $f4
#define a4 $f5
#define a5 $f6
#define a6 $f7
#define a7 $f8
#define a8 $f9

/* t1..t4: copies of loaded values waiting to be added. */
#define t1 $f10
#define t2 $f11
#define t3 $f12
#define t4 $f13

/* s1 accumulates the offset-0 scalars, s2 the offset-1 scalars. */
#define s1 $f0
#define s2 $f1

/*
 * Plain (signed) sum of both scalars of N complex elements of X with
 * stride INCX.  The main loop handles four elements (eight scalars) per
 * iteration; a remainder loop handles N mod 4.  N <= 0 returns zero.
 * This routine makes no test on the sign of INCX.
 * The instruction written directly after each branch is used as its delay
 * slot (assumes PROLOGUE selects noreorder assembly - TODO confirm in the
 * common header).
 */
PROLOGUE

#ifdef F_INTERFACE
/* In this build N and INCX arrive as pointers and are loaded here. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif

/* Clear both partial sums from integer register $0. */
MTC $0, s1

MTC $0, s2
/* Convert the stride from complex elements to bytes. */
dsll INCX, INCX, ZBASE_SHIFT

/* The delay slot computes I = N / 4. */
blez N, .L999
dsra I, N, 2

blez I, .L25
NOP

/* Preload the first four complex elements. */
LD a1, 0 * SIZE(X)
LD a2, 1 * SIZE(X)
daddu X, X, INCX

LD a3, 0 * SIZE(X)
LD a4, 1 * SIZE(X)
daddu X, X, INCX

LD a5, 0 * SIZE(X)
LD a6, 1 * SIZE(X)
daddu X, X, INCX

MOV t1, a1
MOV t2, a2

LD a7, 0 * SIZE(X)
LD a8, 1 * SIZE(X)

MOV t3, a3
MOV t4, a4
daddiu I, I, -1

blez I, .L24
daddu X, X, INCX
.align 3

/* Steady state: add the current four elements while loading the next. */
.L23:
ADD s1, s1, t1
LD a1, 0 * SIZE(X)

MOV t1, a5
daddiu I, I, -1

ADD s2, s2, t2
LD a2, 1 * SIZE(X)

MOV t2, a6
daddu X, X, INCX

ADD s1, s1, t3
LD a3, 0 * SIZE(X)

MOV t3, a7
NOP

ADD s2, s2, t4
LD a4, 1 * SIZE(X)

MOV t4, a8
daddu X, X, INCX

ADD s1, s1, t1
LD a5, 0 * SIZE(X)

MOV t1, a1
NOP

ADD s2, s2, t2
LD a6, 1 * SIZE(X)

MOV t2, a2
daddu X, X, INCX

ADD s1, s1, t3
LD a7, 0 * SIZE(X)

MOV t3, a3
LD a8, 1 * SIZE(X)

ADD s2, s2, t4
daddu X, X, INCX

bgtz I, .L23
MOV t4, a4
.align 3

/* Drain: add the last preloaded block without issuing further loads. */
.L24:
ADD s1, s1, t1
MOV t1, a5

ADD s2, s2, t2
MOV t2, a6

ADD s1, s1, t3
MOV t3, a7

ADD s2, s2, t4
MOV t4, a8

ADD s1, s1, t1
ADD s2, s2, t2
ADD s1, s1, t3
ADD s2, s2, t4
.align 3

/* Remainder: N mod 4 complex elements, one per iteration. */
.L25:
andi I, N, 3

blez I, .L999
NOP
.align 3

.L26:
LD a1, 0 * SIZE(X)
LD a2, 1 * SIZE(X)

MOV t1, a1
daddiu I, I, -1
MOV t2, a2
daddu X, X, INCX

ADD s1, s1, t1
bgtz I, .L26
ADD s2, s2, t2
.align 3

/* Return; the delay slot folds the second partial sum into s1. */
.L999:
j $31
ADD s1, s1, s2

EPILOGUE

+ 446
- 0
kernel/power/sum.S View File

@@ -0,0 +1,446 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Arguments: element count, vector pointer, element stride. */
#define N r3
#define X r4
#define INCX r5

/* Offset handed to L1_PREFETCH. */
#define PREA r8

/* f0 is loaded with 0.0 and doubles as the first accumulator. */
#define FZERO f0

/* Stack frame: 18 saved FP registers (144 bytes) plus scratch at 144. */
#define STACKSIZE 160

/*
 * Plain (signed) sum of N scalars of X with stride INCX.
 * Eight partial sums are kept in f0..f7 and folded into f1 at LL(999).
 * N <= 0 or INCX <= 0 skips all loads, so the folded result is zero.
 * Two paths: unit stride (LL(10)/LL(60)) and general stride
 * (LL(110)/LL(160)), both unrolled by 16 with a one-element remainder loop
 * driven by CTR.
 */
PROLOGUE
PROFCODE

addi SP, SP, -STACKSIZE
li r0, 0

/* Save f14..f31; they are used as temporaries and restored on exit. */
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)

stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)

stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)

stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)

stfd f30, 128(SP)
stfd f31, 136(SP)

/* Build 0.0 in FZERO by storing integer zero and reloading it as a float. */
stw r0, 144(SP)
lfs FZERO,144(SP)

#ifdef F_INTERFACE
/* In this build N and INCX arrive as pointers and are loaded here. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif

/* Convert the stride from elements to bytes. */
slwi INCX, INCX, BASE_SHIFT

/* Clear the remaining seven accumulators. */
fmr f1, FZERO
fmr f2, FZERO
fmr f3, FZERO
fmr f4, FZERO
fmr f5, FZERO
fmr f6, FZERO
fmr f7, FZERO

li PREA, L1_PREFETCHSIZE

/* Nothing to do for N <= 0 or INCX <= 0. */
cmpwi cr0, N, 0
ble- LL(999)

cmpwi cr0, INCX, 0
ble- LL(999)

/* A byte stride other than SIZE takes the general path. */
cmpwi cr0, INCX, SIZE
bne- cr0, LL(100)

/* Unit stride: CTR = N / 16 full blocks. */
srawi. r0, N, 4
mtspr CTR, r0
beq- cr0, LL(50)
.align 4

/* Preload the first 16 values. */
LFD f8, 0 * SIZE(X)
LFD f9, 1 * SIZE(X)
LFD f10, 2 * SIZE(X)
LFD f11, 3 * SIZE(X)
LFD f12, 4 * SIZE(X)
LFD f13, 5 * SIZE(X)
LFD f14, 6 * SIZE(X)
LFD f15, 7 * SIZE(X)

LFD f24, 8 * SIZE(X)
LFD f25, 9 * SIZE(X)
LFD f26, 10 * SIZE(X)
LFD f27, 11 * SIZE(X)
LFD f28, 12 * SIZE(X)
LFD f29, 13 * SIZE(X)
LFD f30, 14 * SIZE(X)
LFD f31, 15 * SIZE(X)

/* Stage the first eight values in f16..f23 for the adds. */
fmr f16, f8
fmr f17, f9
fmr f18, f10
fmr f19, f11

fmr f20, f12
fmr f21, f13
fmr f22, f14
fmr f23, f15
bdz LL(20)
.align 4

/* Steady state: add the 16 values of the current block while loading the
   16 values of the next block (offsets 16..31 from X). */
LL(10):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25

FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27

LFD f8, 16 * SIZE(X)
LFD f9, 17 * SIZE(X)
LFD f10, 18 * SIZE(X)
LFD f11, 19 * SIZE(X)

FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29

FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31

LFD f12, 20 * SIZE(X)
LFD f13, 21 * SIZE(X)
LFD f14, 22 * SIZE(X)
LFD f15, 23 * SIZE(X)

FADD f0, f0, f16
fmr f16, f8
FADD f1, f1, f17
fmr f17, f9

FADD f2, f2, f18
fmr f18, f10
FADD f3, f3, f19
fmr f19, f11

LFD f24, 24 * SIZE(X)
LFD f25, 25 * SIZE(X)
LFD f26, 26 * SIZE(X)
LFD f27, 27 * SIZE(X)

FADD f4, f4, f20
fmr f20, f12
FADD f5, f5, f21
fmr f21, f13

FADD f6, f6, f22
fmr f22, f14
FADD f7, f7, f23
fmr f23, f15

LFD f28, 28 * SIZE(X)
LFD f29, 29 * SIZE(X)
LFD f30, 30 * SIZE(X)
LFD f31, 31 * SIZE(X)

/* The prefetch is issued before or after the pointer bump depending on
   the POWER6 build flag. */
#ifndef POWER6
L1_PREFETCH X, PREA
#endif
addi X, X, 16 * SIZE
#ifdef POWER6
L1_PREFETCH X, PREA
#endif

bdnz LL(10)
.align 4

/* Drain: add the last preloaded 16 values, then step X past them. */
LL(20):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25

FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27

FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29

FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31

FADD f0, f0, f16
FADD f1, f1, f17
FADD f2, f2, f18
FADD f3, f3, f19

FADD f4, f4, f20
FADD f5, f5, f21
FADD f6, f6, f22
FADD f7, f7, f23
addi X, X, 16 * SIZE
.align 4

/* Unit-stride remainder: N mod 16 elements, one per iteration. */
LL(50):
andi. r0, N, 15
mtspr CTR, r0
beq LL(999)
.align 4

LL(60):
LFD f8, 0 * SIZE(X)
addi X, X, 1 * SIZE

FADD f0, f0, f8

bdnz LL(60)
b LL(999)
.align 4

/* General stride.  X is moved back by one stride because every LFDUX
   below is used as "advance X by INCX, then load" (LFDUX is a macro from
   the common header, expected to expand to an update-form indexed load). */
LL(100):
sub X, X, INCX

srawi. r0, N, 4
mtspr CTR, r0
beq- LL(150)

LFDUX f8, X, INCX
LFDUX f9, X, INCX
LFDUX f10, X, INCX
LFDUX f11, X, INCX
LFDUX f12, X, INCX
LFDUX f13, X, INCX
LFDUX f14, X, INCX
LFDUX f15, X, INCX

LFDUX f24, X, INCX
LFDUX f25, X, INCX
LFDUX f26, X, INCX
LFDUX f27, X, INCX
LFDUX f28, X, INCX
LFDUX f29, X, INCX
LFDUX f30, X, INCX
LFDUX f31, X, INCX

fmr f16, f8
fmr f17, f9
fmr f18, f10
fmr f19, f11

fmr f20, f12
fmr f21, f13
fmr f22, f14
fmr f23, f15
bdz LL(120)
.align 4

/* Strided steady state: same add/load interleave as LL(10). */
LL(110):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25

FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27

LFDUX f8, X, INCX
LFDUX f9, X, INCX
LFDUX f10, X, INCX
LFDUX f11, X, INCX

FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29

FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31

LFDUX f12, X, INCX
LFDUX f13, X, INCX
LFDUX f14, X, INCX
LFDUX f15, X, INCX

FADD f0, f0, f16
fmr f16, f8
FADD f1, f1, f17
fmr f17, f9

FADD f2, f2, f18
fmr f18, f10
FADD f3, f3, f19
fmr f19, f11

LFDUX f24, X, INCX
LFDUX f25, X, INCX
LFDUX f26, X, INCX
LFDUX f27, X, INCX

FADD f4, f4, f20
fmr f20, f12
FADD f5, f5, f21
fmr f21, f13

FADD f6, f6, f22
fmr f22, f14
FADD f7, f7, f23
fmr f23, f15

LFDUX f28, X, INCX
LFDUX f29, X, INCX
LFDUX f30, X, INCX
LFDUX f31, X, INCX
bdnz LL(110)
.align 4

/* Drain the last preloaded 16 values of the strided path. */
LL(120):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25

FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27

FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29

FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31

FADD f0, f0, f16
FADD f1, f1, f17
FADD f2, f2, f18
FADD f3, f3, f19

FADD f4, f4, f20
FADD f5, f5, f21
FADD f6, f6, f22
FADD f7, f7, f23
.align 4

/* Strided remainder: N mod 16 elements, one per iteration. */
LL(150):
andi. r0, N, 15
mtspr CTR, r0
beq LL(999)
.align 4

LL(160):
LFDUX f8, X, INCX
FADD f0, f0, f8
bdnz LL(160)
.align 4

/* Fold the eight partial sums pairwise; the total is left in f1. */
LL(999):
FADD f0, f0, f1
FADD f2, f2, f3
FADD f4, f4, f5
FADD f6, f6, f7

FADD f0, f0, f2
FADD f4, f4, f6
FADD f1, f0, f4

/* Restore f14..f31 and release the stack frame. */
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)

lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)

lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)

lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)

lfd f30, 128(SP)
lfd f31, 136(SP)

addi SP, SP, STACKSIZE
blr

EPILOGUE

+ 452
- 0
kernel/power/zsum.S View File

@@ -0,0 +1,452 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Arguments: complex element count, vector pointer, element stride. */
#define N r3
#define X r4
#define INCX r5

/* INCXM1: byte stride minus SIZE, used to address the first scalar of the
   next element.  PREA: offset handed to L1_PREFETCH. */
#define INCXM1 r9
#define PREA r8

/* f0 is loaded with 0.0 and doubles as the first accumulator. */
#define FZERO f0

/* Stack frame: 18 saved FP registers (144 bytes) plus scratch at 144. */
#define STACKSIZE 160

/*
 * Plain (signed) sum of both scalars of N complex elements of X with
 * stride INCX.  Eight partial sums are kept in f0..f7 and folded into f1
 * at LL(999).  N <= 0 or INCX <= 0 skips all loads, so the folded result
 * is zero.  Two paths: contiguous (LL(10)/LL(60)) and general stride
 * (LL(110)/LL(160)), both handling 8 complex elements (16 scalars) per
 * unrolled iteration with a one-element remainder loop driven by CTR.
 */
PROLOGUE
PROFCODE

addi SP, SP, -STACKSIZE
li r0, 0

/* Save f14..f31; they are used as temporaries and restored on exit. */
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)

stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)

stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)

stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)

stfd f30, 128(SP)
stfd f31, 136(SP)

/* Build 0.0 in FZERO by storing integer zero and reloading it as a float. */
stw r0, 144(SP)
lfs FZERO,144(SP)

#ifdef F_INTERFACE
/* In this build N and INCX arrive as pointers and are loaded here. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif

/* Convert the stride from complex elements to bytes and derive INCXM1. */
slwi INCX, INCX, ZBASE_SHIFT
subi INCXM1, INCX, SIZE

/* Clear the remaining seven accumulators. */
fmr f1, FZERO
fmr f2, FZERO
fmr f3, FZERO
fmr f4, FZERO
fmr f5, FZERO
fmr f6, FZERO
fmr f7, FZERO

li PREA, L1_PREFETCHSIZE

/* Nothing to do for N <= 0 or INCX <= 0. */
cmpwi cr0, N, 0
ble- LL(999)

cmpwi cr0, INCX, 0
ble- LL(999)

/* A byte stride other than 2 * SIZE takes the general path. */
cmpwi cr0, INCX, 2 * SIZE
bne- cr0, LL(100)

/* Contiguous: CTR = N / 8 full blocks of 16 scalars. */
srawi. r0, N, 3
mtspr CTR, r0
beq- cr0, LL(50)
.align 4

/* Preload the first 16 scalars. */
LFD f8, 0 * SIZE(X)
LFD f9, 1 * SIZE(X)
LFD f10, 2 * SIZE(X)
LFD f11, 3 * SIZE(X)
LFD f12, 4 * SIZE(X)
LFD f13, 5 * SIZE(X)
LFD f14, 6 * SIZE(X)
LFD f15, 7 * SIZE(X)

LFD f24, 8 * SIZE(X)
LFD f25, 9 * SIZE(X)
LFD f26, 10 * SIZE(X)
LFD f27, 11 * SIZE(X)
LFD f28, 12 * SIZE(X)
LFD f29, 13 * SIZE(X)
LFD f30, 14 * SIZE(X)
LFD f31, 15 * SIZE(X)

/* Stage the first eight scalars in f16..f23 for the adds. */
fmr f16, f8
fmr f17, f9
fmr f18, f10
fmr f19, f11

fmr f20, f12
fmr f21, f13
fmr f22, f14
fmr f23, f15
bdz LL(20)
.align 4

/* Steady state: add the 16 scalars of the current block while loading the
   16 scalars of the next block (offsets 16..31 from X). */
LL(10):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25

FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27

LFD f8, 16 * SIZE(X)
LFD f9, 17 * SIZE(X)
LFD f10, 18 * SIZE(X)
LFD f11, 19 * SIZE(X)

FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29

FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31

LFD f12, 20 * SIZE(X)
LFD f13, 21 * SIZE(X)
LFD f14, 22 * SIZE(X)
LFD f15, 23 * SIZE(X)

FADD f0, f0, f16
fmr f16, f8
FADD f1, f1, f17
fmr f17, f9

FADD f2, f2, f18
fmr f18, f10
FADD f3, f3, f19
fmr f19, f11

LFD f24, 24 * SIZE(X)
LFD f25, 25 * SIZE(X)
LFD f26, 26 * SIZE(X)
LFD f27, 27 * SIZE(X)

FADD f4, f4, f20
fmr f20, f12
FADD f5, f5, f21
fmr f21, f13

FADD f6, f6, f22
fmr f22, f14
FADD f7, f7, f23
fmr f23, f15

LFD f28, 28 * SIZE(X)
LFD f29, 29 * SIZE(X)
LFD f30, 30 * SIZE(X)
LFD f31, 31 * SIZE(X)

/* The prefetch is issued before or after the pointer bump depending on
   the POWER6 build flag. */
#ifndef POWER6
L1_PREFETCH X, PREA
#endif
addi X, X, 16 * SIZE
#ifdef POWER6
L1_PREFETCH X, PREA
#endif

bdnz LL(10)
.align 4

/* Drain: add the last preloaded 16 scalars, then step X past them. */
LL(20):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25

FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27

FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29

FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31

FADD f0, f0, f16
FADD f1, f1, f17
FADD f2, f2, f18
FADD f3, f3, f19

FADD f4, f4, f20
FADD f5, f5, f21
FADD f6, f6, f22
FADD f7, f7, f23
addi X, X, 16 * SIZE
.align 4

/* Contiguous remainder: N mod 8 complex elements, one per iteration. */
LL(50):
andi. r0, N, 7
mtspr CTR, r0
beq LL(999)
.align 4

LL(60):
LFD f8, 0 * SIZE(X)
LFD f9, 1 * SIZE(X)
addi X, X, 2 * SIZE

FADD f0, f0, f8
FADD f1, f1, f9

bdnz LL(60)
b LL(999)
.align 4

/* General stride.  X is moved back by INCXM1 so that each pair below reads
   one element: LFDX loads from X + INCXM1 without changing X, and LFDUX
   loads from X + INCX and leaves X at that address.  (LFDX/LFDUX are
   macros from the common header, expected to expand to the indexed and
   update-form indexed loads.) */
LL(100):
sub X, X, INCXM1

srawi. r0, N, 3
mtspr CTR, r0
beq- LL(150)

LFDX f8, X, INCXM1
LFDUX f9, X, INCX
LFDX f10, X, INCXM1
LFDUX f11, X, INCX
LFDX f12, X, INCXM1
LFDUX f13, X, INCX
LFDX f14, X, INCXM1
LFDUX f15, X, INCX

LFDX f24, X, INCXM1
LFDUX f25, X, INCX
LFDX f26, X, INCXM1
LFDUX f27, X, INCX
LFDX f28, X, INCXM1
LFDUX f29, X, INCX
LFDX f30, X, INCXM1
LFDUX f31, X, INCX

fmr f16, f8
fmr f17, f9
fmr f18, f10
fmr f19, f11

fmr f20, f12
fmr f21, f13
fmr f22, f14
fmr f23, f15
bdz LL(120)
.align 4

/* Strided steady state: same add/load interleave as LL(10). */
LL(110):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25

FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27

LFDX f8, X, INCXM1
LFDUX f9, X, INCX
LFDX f10, X, INCXM1
LFDUX f11, X, INCX

FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29

FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31

LFDX f12, X, INCXM1
LFDUX f13, X, INCX
LFDX f14, X, INCXM1
LFDUX f15, X, INCX

FADD f0, f0, f16
fmr f16, f8
FADD f1, f1, f17
fmr f17, f9

FADD f2, f2, f18
fmr f18, f10
FADD f3, f3, f19
fmr f19, f11

LFDX f24, X, INCXM1
LFDUX f25, X, INCX
LFDX f26, X, INCXM1
LFDUX f27, X, INCX

FADD f4, f4, f20
fmr f20, f12
FADD f5, f5, f21
fmr f21, f13

FADD f6, f6, f22
fmr f22, f14
FADD f7, f7, f23
fmr f23, f15

LFDX f28, X, INCXM1
LFDUX f29, X, INCX
LFDX f30, X, INCXM1
LFDUX f31, X, INCX
bdnz LL(110)
.align 4

/* Drain the last preloaded 16 scalars of the strided path. */
LL(120):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25

FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27

FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29

FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31

FADD f0, f0, f16
FADD f1, f1, f17
FADD f2, f2, f18
FADD f3, f3, f19

FADD f4, f4, f20
FADD f5, f5, f21
FADD f6, f6, f22
FADD f7, f7, f23
.align 4

/* Strided remainder: N mod 8 complex elements, one per iteration. */
LL(150):
andi. r0, N, 7
mtspr CTR, r0
beq LL(999)
.align 4

LL(160):
LFDX f8, X, INCXM1
LFDUX f9, X, INCX
FADD f0, f0, f8
FADD f1, f1, f9
bdnz LL(160)
.align 4

/* Fold the eight partial sums pairwise; the total is left in f1. */
LL(999):
FADD f0, f0, f1
FADD f2, f2, f3
FADD f4, f4, f5
FADD f6, f6, f7

FADD f0, f0, f2
FADD f4, f4, f6
FADD f1, f0, f4

/* Restore f14..f31 and release the stack frame. */
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)

lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)

lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)

lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)

lfd f30, 128(SP)
lfd f31, 136(SP)

addi SP, SP, STACKSIZE
blr

EPILOGUE

+ 6
- 6
kernel/setparam-ref.c View File

@@ -70,7 +70,7 @@ gotoblas_t TABLE_NAME = {

samax_kTS, samin_kTS, smax_kTS, smin_kTS,
isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS,
snrm2_kTS, sasum_kTS, scopy_kTS, sdot_kTS,
snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS,
dsdot_kTS,
srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS,
sgemv_nTS, sgemv_tTS, sger_kTS,
@@ -126,7 +126,7 @@ gotoblas_t TABLE_NAME = {

damax_kTS, damin_kTS, dmax_kTS, dmin_kTS,
idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS,
dnrm2_kTS, dasum_kTS, dcopy_kTS, ddot_kTS,
dnrm2_kTS, dasum_kTS, dsum_kTS, dcopy_kTS, ddot_kTS,
drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS,
dgemv_nTS, dgemv_tTS, dger_kTS,
dsymv_LTS, dsymv_UTS,
@@ -178,7 +178,7 @@ gotoblas_t TABLE_NAME = {

qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS,
iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS,
qnrm2_kTS, qasum_kTS, qcopy_kTS, qdot_kTS,
qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS,
qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS,
qgemv_nTS, qgemv_tTS, qger_kTS,
qsymv_LTS, qsymv_UTS,
@@ -234,7 +234,7 @@ gotoblas_t TABLE_NAME = {
#endif

camax_kTS, camin_kTS, icamax_kTS, icamin_kTS,
cnrm2_kTS, casum_kTS, ccopy_kTS,
cnrm2_kTS, casum_kTS, csum_kTS, ccopy_kTS,
cdotu_kTS, cdotc_kTS, csrot_kTS,
caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS,

@@ -369,7 +369,7 @@ gotoblas_t TABLE_NAME = {
#endif

zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS,
znrm2_kTS, zasum_kTS, zcopy_kTS,
znrm2_kTS, zasum_kTS, zsum_kTS, zcopy_kTS,
zdotu_kTS, zdotc_kTS, zdrot_kTS,
zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS,

@@ -500,7 +500,7 @@ gotoblas_t TABLE_NAME = {
XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N),

xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS,
xnrm2_kTS, xasum_kTS, xcopy_kTS,
xnrm2_kTS, xasum_kTS, xsum_kTS, xcopy_kTS,
xdotu_kTS, xdotc_kTS, xqrot_kTS,
xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS,



+ 325
- 0
kernel/sparc/sum.S View File

@@ -0,0 +1,325 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Integer registers: element count, vector pointer, stride and loop counter. */
#define N %i0
#define X %i1
#define INCX %i2
#define I %i3

/* Floating-point aliases: c1/c2 are the running sums, t1-t4 hold values
   that are waiting to be added, a1-a8 receive freshly loaded elements. */
#ifdef DOUBLE
#define c1 %f0
#define c2 %f2
#define t1 %f8
#define t2 %f10
#define t3 %f12
#define t4 %f14

#define a1 %f16
#define a2 %f18
#define a3 %f20
#define a4 %f22
#define a5 %f24
#define a6 %f26
#define a7 %f28
#define a8 %f30
#else
#define c1 %f0
#define c2 %f1
#define t1 %f4
#define t2 %f5
#define t3 %f6
#define t4 %f7

#define a1 %f8
#define a2 %f9
#define a3 %f10
#define a4 %f11
#define a5 %f12
#define a6 %f13
#define a7 %f14
#define a8 %f15
#endif

/* Plain (signed) sum of N elements of X taken with stride INCX.
   The result is left in c1 (%f0). */
PROLOGUE
SAVESP

/* NOTE(review): FCLR is defined outside this file; presumably it zeroes
   %f0 (c1) - confirm in common.h. */
FCLR(0)

/* Scale the stride; BASE_SHIFT is presumably log2(SIZE) so that INCX
   becomes a byte offset - confirm in common.h. */
sll INCX, BASE_SHIFT, INCX

/* Copy c1 into the second accumulator and the four pending-value slots. */
FMOV c1, c2
FMOV c1, t1
FMOV c1, t2
FMOV c1, t3
FMOV c1, t4

/* Non-positive stride: go straight to the final reduction.
   Neither branch below is followed by a nop, so the next instruction sits
   in its delay slot: "cmp INCX, SIZE" after ble and "sra N, 3, I" after
   bne. Both are harmless on the taken path (.LL19 does not read the
   condition codes, .LL50 recomputes I). */
cmp INCX, 0
ble .LL19
cmp INCX, SIZE
bne .LL50

/* Unit-stride path: I = number of complete groups of eight elements. */
sra N, 3, I
cmp I, 0
ble,pn %icc, .LL15
nop

/* Preload the first group of eight. */
LDF [X + 0 * SIZE], a1
add I, -1, I
LDF [X + 1 * SIZE], a2
cmp I, 0
LDF [X + 2 * SIZE], a3
LDF [X + 3 * SIZE], a4
LDF [X + 4 * SIZE], a5
LDF [X + 5 * SIZE], a6
LDF [X + 6 * SIZE], a7
LDF [X + 7 * SIZE], a8

/* The pointer advance sits in the delay slot and runs on both paths. */
ble,pt %icc, .LL12
add X, 8 * SIZE, X

#define PREFETCHSIZE 128

/* Steady state: add the pending t values, move the previously loaded
   a values into t, and load the next group of eight. */
.LL11:
FADD c1, t1, c1
prefetch [X + PREFETCHSIZE * SIZE], 0
FMOV a1, t1
LDF [X + 0 * SIZE], a1

FADD c2, t2, c2
add I, -1, I
FMOV a2, t2
LDF [X + 1 * SIZE], a2

FADD c1, t3, c1
cmp I, 0
FMOV a3, t3
LDF [X + 2 * SIZE], a3

FADD c2, t4, c2
nop
FMOV a4, t4
LDF [X + 3 * SIZE], a4

FADD c1, t1, c1
nop
FMOV a5, t1
LDF [X + 4 * SIZE], a5

FADD c2, t2, c2
nop
FMOV a6, t2
LDF [X + 5 * SIZE], a6

FADD c1, t3, c1
FMOV a7, t3
LDF [X + 6 * SIZE], a7
add X, 8 * SIZE, X

/* X has already been advanced, so the eighth element of the group is
   fetched at offset -1 from the branch delay slot. */
FADD c2, t4, c2
FMOV a8, t4
bg,pt %icc, .LL11
LDF [X - 1 * SIZE], a8

/* Drain: consume the last loaded group without issuing further loads. */
.LL12:
FADD c1, t1, c1
FMOV a1, t1
FADD c2, t2, c2
FMOV a2, t2

FADD c1, t3, c1
FMOV a3, t3
FADD c2, t4, c2
FMOV a4, t4

FADD c1, t1, c1
FMOV a5, t1
FADD c2, t2, c2
FMOV a6, t2

FADD c1, t3, c1
FMOV a7, t3
FADD c2, t4, c2
FMOV a8, t4

/* Remainder: the N & 7 elements left after the unrolled groups. */
.LL15:
and N, 7, I
cmp I, 0
ble,a,pn %icc, .LL19
nop

/* One element per iteration; the loaded value stays pending in t1 until
   the next iteration or the final reduction. */
.LL16:
LDF [X + 0 * SIZE], a1
add I, -1, I
cmp I, 0
FADD c1, t1, c1
FMOV a1, t1
bg,pt %icc, .LL16
add X, 1 * SIZE, X

/* Final reduction: add the pending values, then combine both accumulators
   into c1. */
.LL19:
FADD c1, t1, c1
FADD c2, t2, c2
FADD c1, t3, c1
FADD c2, t4, c2

FADD c1, c2, c1
return %i7 + 8
clr %g0

/* General-stride path: same pipeline, X advanced by INCX after each load. */
.LL50:
sra N, 3, I
cmp I, 0
ble,pn %icc, .LL55
nop

/* Preload the first group of eight strided elements. */
LDF [X + 0 * SIZE], a1
add X, INCX, X
LDF [X + 0 * SIZE], a2
add X, INCX, X
LDF [X + 0 * SIZE], a3
add X, INCX, X
LDF [X + 0 * SIZE], a4
add X, INCX, X
LDF [X + 0 * SIZE], a5
add X, INCX, X
LDF [X + 0 * SIZE], a6
add X, INCX, X
add I, -1, I
LDF [X + 0 * SIZE], a7
cmp I, 0
add X, INCX, X
LDF [X + 0 * SIZE], a8

ble,pt %icc, .LL52
add X, INCX, X

/* Steady state for the strided case. */
.LL51:
FADD c1, t1, c1
add I, -1, I
FMOV a1, t1
LDF [X + 0 * SIZE], a1
add X, INCX, X

FADD c2, t2, c2
cmp I, 0
FMOV a2, t2
LDF [X + 0 * SIZE], a2
add X, INCX, X

FADD c1, t3, c1
FMOV a3, t3
LDF [X + 0 * SIZE], a3
add X, INCX, X

FADD c2, t4, c2
FMOV a4, t4
LDF [X + 0 * SIZE], a4
add X, INCX, X

FADD c1, t1, c1
FMOV a5, t1
LDF [X + 0 * SIZE], a5
add X, INCX, X

FADD c2, t2, c2
FMOV a6, t2
LDF [X + 0 * SIZE], a6
add X, INCX, X

FADD c1, t3, c1
FMOV a7, t3
LDF [X + 0 * SIZE], a7
add X, INCX, X

FADD c2, t4, c2
FMOV a8, t4
LDF [X + 0 * SIZE], a8

bg,pt %icc, .LL51
add X, INCX, X

/* Drain the last loaded strided group. */
.LL52:
FADD c1, t1, c1
FMOV a1, t1
FADD c2, t2, c2
FMOV a2, t2

FADD c1, t3, c1
FMOV a3, t3
FADD c2, t4, c2
FMOV a4, t4

FADD c1, t1, c1
FMOV a5, t1
FADD c2, t2, c2
FMOV a6, t2

FADD c1, t3, c1
FMOV a7, t3
FADD c2, t4, c2
FMOV a8, t4

/* Strided remainder: N & 7 elements. */
.LL55:
and N, 7, I
cmp I, 0
ble,a,pn %icc, .LL59
nop

.LL56:
LDF [X + 0 * SIZE], a1
FADD c1, t1, c1
add I, -1, I
FMOV a1, t1
cmp I, 0
bg,pt %icc, .LL56
add X, INCX, X

/* Final reduction for the strided path; result in c1. */
.LL59:
FADD c1, t1, c1
FADD c2, t2, c2
FADD c1, t3, c1
FADD c2, t4, c2

FADD c1, c2, c1
return %i7 + 8
clr %o0

EPILOGUE

+ 327
- 0
kernel/sparc/zsum.S View File

@@ -0,0 +1,327 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Integer registers: element count, vector pointer, stride and loop counter. */
#define N %i0
#define X %i1
#define INCX %i2
#define I %i3

/* Floating-point aliases: c1/c2 are the running sums, t1-t4 hold values
   that are waiting to be added, a1-a8 receive freshly loaded components. */
#ifdef DOUBLE
#define c1 %f0
#define c2 %f2
#define t1 %f8
#define t2 %f10
#define t3 %f12
#define t4 %f14

#define a1 %f16
#define a2 %f18
#define a3 %f20
#define a4 %f22
#define a5 %f24
#define a6 %f26
#define a7 %f28
#define a8 %f30
#else
#define c1 %f0
#define c2 %f1
#define t1 %f4
#define t2 %f5
#define t3 %f6
#define t4 %f7

#define a1 %f8
#define a2 %f9
#define a3 %f10
#define a4 %f11
#define a5 %f12
#define a6 %f13
#define a7 %f14
#define a8 %f15
#endif

/* Plain (signed) sum over N two-component elements of X taken with stride
   INCX: both components of every element are added into a single scalar,
   which is left in c1 (%f0). */
PROLOGUE
SAVESP

/* NOTE(review): FCLR is defined outside this file; presumably it zeroes
   %f0 (c1) - confirm in common.h. */
FCLR(0)

/* Scale the stride; ZBASE_SHIFT is presumably log2(2 * SIZE) so that INCX
   becomes a byte offset - confirm in common.h. */
sll INCX, ZBASE_SHIFT, INCX

/* Copy c1 into the second accumulator and the four pending-value slots. */
FMOV c1, c2
FMOV c1, t1
FMOV c1, t2
FMOV c1, t3
FMOV c1, t4

/* Non-positive stride: go straight to the final reduction. */
cmp INCX, 0
ble .LL19
nop

/* Anything other than a contiguous layout takes the strided path. */
cmp INCX, 2 * SIZE
bne .LL50
nop

/* Contiguous path: I = number of groups of four elements (eight scalars). */
sra N, 2, I
cmp I, 0
ble,pn %icc, .LL15
nop

/* Preload the first eight scalars. */
LDF [X + 0 * SIZE], a1
add I, -1, I
LDF [X + 1 * SIZE], a2
cmp I, 0
LDF [X + 2 * SIZE], a3
LDF [X + 3 * SIZE], a4
LDF [X + 4 * SIZE], a5
LDF [X + 5 * SIZE], a6
LDF [X + 6 * SIZE], a7
LDF [X + 7 * SIZE], a8

/* The pointer advance sits in the delay slot and runs on both paths. */
ble,pt %icc, .LL12
add X, 8 * SIZE, X

#define PREFETCHSIZE 32

/* Steady state: add the pending t values, move the previously loaded
   a values into t, and load the next eight scalars. */
.LL11:
FADD c1, t1, c1
prefetch [X + PREFETCHSIZE * SIZE], 0
FMOV a1, t1
LDF [X + 0 * SIZE], a1

FADD c2, t2, c2
add I, -1, I
FMOV a2, t2
LDF [X + 1 * SIZE], a2

FADD c1, t3, c1
cmp I, 0
FMOV a3, t3
LDF [X + 2 * SIZE], a3

FADD c2, t4, c2
nop
FMOV a4, t4
LDF [X + 3 * SIZE], a4

FADD c1, t1, c1
nop
FMOV a5, t1
LDF [X + 4 * SIZE], a5

FADD c2, t2, c2
nop
FMOV a6, t2
LDF [X + 5 * SIZE], a6

FADD c1, t3, c1
FMOV a7, t3
LDF [X + 6 * SIZE], a7
add X, 8 * SIZE, X

/* X has already been advanced, so the eighth scalar of the group is
   fetched at offset -1 from the branch delay slot. */
FADD c2, t4, c2
FMOV a8, t4
bg,pt %icc, .LL11
LDF [X - 1 * SIZE], a8

/* Drain: consume the last loaded group without issuing further loads. */
.LL12:
FADD c1, t1, c1
FMOV a1, t1
FADD c2, t2, c2
FMOV a2, t2

FADD c1, t3, c1
FMOV a3, t3
FADD c2, t4, c2
FMOV a4, t4

FADD c1, t1, c1
FMOV a5, t1
FADD c2, t2, c2
FMOV a6, t2

FADD c1, t3, c1
FMOV a7, t3
FADD c2, t4, c2
FMOV a8, t4

/* Remainder: the N & 3 elements left after the unrolled groups. */
.LL15:
and N, 3, I
cmp I, 0
ble,a,pn %icc, .LL19
nop

/* One element (two scalars) per iteration; the loaded values stay pending
   in t1/t2 until the next iteration or the final reduction. */
.LL16:
LDF [X + 0 * SIZE], a1
LDF [X + 1 * SIZE], a2
add I, -1, I
cmp I, 0
FADD c1, t1, c1
FADD c2, t2, c2
FMOV a1, t1
FMOV a2, t2
bg,pt %icc, .LL16
add X, 2 * SIZE, X

/* Final reduction: add the pending values, then combine both accumulators
   into c1. */
.LL19:
FADD c1, t1, c1
FADD c2, t2, c2
FADD c1, t3, c1
FADD c2, t4, c2

FADD c1, c2, c1
return %i7 + 8
clr %g0

/* General-stride path: both scalars of an element are read at offsets 0
   and 1, then X is advanced by INCX. */
.LL50:
sra N, 2, I
cmp I, 0
ble,pn %icc, .LL55
nop

/* Preload the first four strided elements. */
LDF [X + 0 * SIZE], a1
LDF [X + 1 * SIZE], a2
add X, INCX, X
LDF [X + 0 * SIZE], a3
LDF [X + 1 * SIZE], a4
add X, INCX, X
LDF [X + 0 * SIZE], a5
LDF [X + 1 * SIZE], a6
add X, INCX, X
add I, -1, I
LDF [X + 0 * SIZE], a7
cmp I, 0
LDF [X + 1 * SIZE], a8

ble,pt %icc, .LL52
add X, INCX, X

/* Steady state for the strided case. */
.LL51:
FADD c1, t1, c1
add I, -1, I
FMOV a1, t1
LDF [X + 0 * SIZE], a1

FADD c2, t2, c2
cmp I, 0
FMOV a2, t2
LDF [X + 1 * SIZE], a2
add X, INCX, X

FADD c1, t3, c1
FMOV a3, t3
LDF [X + 0 * SIZE], a3

FADD c2, t4, c2
FMOV a4, t4
LDF [X + 1 * SIZE], a4
add X, INCX, X

FADD c1, t1, c1
FMOV a5, t1
LDF [X + 0 * SIZE], a5

FADD c2, t2, c2
FMOV a6, t2
LDF [X + 1 * SIZE], a6
add X, INCX, X

FADD c1, t3, c1
FMOV a7, t3
LDF [X + 0 * SIZE], a7

FADD c2, t4, c2
FMOV a8, t4
LDF [X + 1 * SIZE], a8

bg,pt %icc, .LL51
add X, INCX, X

/* Drain the last loaded strided group. */
.LL52:
FADD c1, t1, c1
FMOV a1, t1
FADD c2, t2, c2
FMOV a2, t2

FADD c1, t3, c1
FMOV a3, t3
FADD c2, t4, c2
FMOV a4, t4

FADD c1, t1, c1
FMOV a5, t1
FADD c2, t2, c2
FMOV a6, t2

FADD c1, t3, c1
FMOV a7, t3
FADD c2, t4, c2
FMOV a8, t4

/* Strided remainder: N & 3 elements. */
.LL55:
and N, 3, I
cmp I, 0
ble,a,pn %icc, .LL59
nop

.LL56:
LDF [X + 0 * SIZE], a1
LDF [X + 1 * SIZE], a2
FADD c1, t1, c1
FADD c2, t2, c2
add I, -1, I
FMOV a1, t1
FMOV a2, t2
cmp I, 0
bg,pt %icc, .LL56
add X, INCX, X

/* Final reduction for the strided path; result in c1. */
.LL59:
FADD c1, t1, c1
FADD c2, t2, c2
FADD c1, t3, c1
FADD c2, t4, c2

FADD c1, c2, c1

return %i7 + 8
clr %o0

EPILOGUE

+ 5
- 0
kernel/x86/KERNEL.generic View File

@@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c

# ?sum kernels: real precisions use ../arm/sum.c, complex precisions use ../arm/zsum.c.
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c

SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c


+ 207
- 0
kernel/x86/sum.S View File

@@ -0,0 +1,207 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* STACK is the number of bytes pushed in the prologue (two pushl), so the
   arguments are found at 4/8/12 + STACK above %esp. */
#define STACK 8
#define ARGS 0

#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)

/* Working registers: element count, vector pointer, stride. */
#define M %edx
#define X %ecx
#define INCX %esi

/* Loop counter. */
#define I %eax

#include "l1param.h"

/* Plain (signed) sum of M elements of X taken with stride INCX, computed
   on the x87 stack. The result is left in %st(0). */
PROLOGUE

pushl %esi
pushl %ebx

PROFCODE

#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
EMMS
#endif

movl STACK_M, M
movl STACK_X, X
movl STACK_INCX, INCX

/* With the Fortran interface the scalar arguments arrive by reference. */
#ifdef F_INTERFACE
movl (M), M
movl (INCX), INCX
#endif

/* Push 0.0; for a non-positive count or stride this single zero is the
   return value. */
fldz
testl M, M
jle .L999
testl INCX, INCX
jle .L999

/* Scale the stride (BASE_SHIFT is presumably log2(SIZE) - confirm), then
   push three more zeros so four partial sums live in %st(0)-%st(3). */
sall $BASE_SHIFT, INCX
fldz
fldz
fldz
cmpl $SIZE, INCX
jne .L40

/* Unit-stride path: I = number of groups of eight elements. */
movl M, I
sarl $3, I
jle .L20
ALIGN_4

.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

/* Push four elements; the partial sums are now at %st(4)-%st(7). */
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
FLD 2 * SIZE(X)
FLD 3 * SIZE(X)

/* Each faddp adds the top value into one partial sum and pops. Because
   the stack shrinks by one each time, the targets 7, 5, 3, 1 address four
   different partial sums. */
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

FLD 4 * SIZE(X)
FLD 5 * SIZE(X)
FLD 6 * SIZE(X)
FLD 7 * SIZE(X)

addl $8 * SIZE, X

faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

decl I
jg .L10
ALIGN_4

/* Unit-stride remainder: M & 7 elements. */
.L20:
movl M, I
andl $7, I
jle .L998
ALIGN_4


/* One element per iteration, added into the partial sum at the top. */
.L21:
FLD (X)
faddp %st,%st(1)
addl $1 * SIZE, X
decl I
jg .L21
jmp .L998
ALIGN_4

/* General-stride path: same scheme, X advanced by INCX after each load. */
.L40:
movl M, I
sarl $3, I
jle .L60
ALIGN_4

.L50:
FLD (X)
addl INCX, X
FLD (X)
addl INCX, X
FLD (X)
addl INCX, X
FLD (X)
addl INCX, X

faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

FLD (X)
addl INCX, X
FLD (X)
addl INCX, X
FLD (X)
addl INCX, X
FLD (X)
addl INCX, X

faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

decl I
jg .L50
ALIGN_4

/* Strided remainder: M & 7 elements. */
.L60:
movl M, I
andl $7, I
jle .L998
ALIGN_4


.L61:
FLD (X)
addl INCX, X
faddp %st,%st(1)
decl I
jg .L61
ALIGN_4

/* Fold the four partial sums into a single value in %st(0). */
.L998:
faddp %st,%st(2)
faddp %st,%st(1)
faddp %st,%st(1)
ALIGN_4

/* Restore the saved registers in reverse push order and return. */
.L999:
popl %ebx
popl %esi
ret

EPILOGUE

+ 208
- 0
kernel/x86/zsum.S View File

@@ -0,0 +1,208 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* STACK is the number of bytes pushed in the prologue (two pushl), so the
   arguments are found at 4/8/12 + STACK above %esp. */
#define STACK 8
#define ARGS 0

#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)

/* Working registers: element count, vector pointer, stride. */
#define M %edx
#define X %ecx
#define INCX %esi

/* Loop counter. */
#define I %eax

#include "l1param.h"

/* Plain (signed) sum over M two-component elements of X taken with stride
   INCX: both components of every element are added into one scalar on the
   x87 stack. The result is left in %st(0). */
PROLOGUE

pushl %esi
pushl %ebx

PROFCODE

#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
EMMS
#endif

movl STACK_M, M
movl STACK_X, X
movl STACK_INCX, INCX

/* With the Fortran interface the scalar arguments arrive by reference. */
#ifdef F_INTERFACE
movl (M), M
movl (INCX), INCX
#endif

/* Push 0.0; for a non-positive count or stride this single zero is the
   return value. */
fldz
testl M, M
jle .L999
testl INCX, INCX
jle .L999

/* Scale the stride (ZBASE_SHIFT is presumably log2(2 * SIZE) - confirm). */
sall $ZBASE_SHIFT, INCX

/* Three more zeros: four partial sums now live in %st(0)-%st(3). */
fldz
fldz
fldz
cmpl $SIZE * 2, INCX
jne .L40

/* Contiguous path: I = number of groups of four elements (eight scalars). */
movl M, I
sarl $2, I
jle .L20
ALIGN_4

.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

/* Push four scalars; the partial sums are now at %st(4)-%st(7). */
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
FLD 2 * SIZE(X)
FLD 3 * SIZE(X)

/* Each faddp adds the top value into one partial sum and pops. Because
   the stack shrinks by one each time, the targets 7, 5, 3, 1 address four
   different partial sums. */
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

FLD 4 * SIZE(X)
FLD 5 * SIZE(X)
FLD 6 * SIZE(X)
FLD 7 * SIZE(X)

addl $8 * SIZE, X

faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

decl I
jg .L10
ALIGN_4

/* Contiguous remainder: M & 3 elements. */
.L20:
movl M, I
andl $3, I
jle .L998
ALIGN_4


/* One element per iteration: its two scalars go to two partial sums. */
.L21:
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
faddp %st,%st(3)
faddp %st,%st(1)
addl $2 * SIZE, X
decl I
jg .L21
jmp .L998
ALIGN_4

/* General-stride path: two scalars are read per element, then X is
   advanced by INCX. */
.L40:
movl M, I
sarl $2, I
jle .L60
ALIGN_4

.L50:
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addl INCX, X
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addl INCX, X

faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addl INCX, X
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addl INCX, X

faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

decl I
jg .L50
ALIGN_4

/* Strided remainder: M & 3 elements. */
.L60:
movl M, I
andl $3, I
jle .L998
ALIGN_4


.L61:
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addl INCX, X
faddp %st,%st(3)
faddp %st,%st(1)
decl I
jg .L61
ALIGN_4

/* Fold the four partial sums into a single value in %st(0). */
.L998:
faddp %st,%st(2)
faddp %st,%st(1)
faddp %st,%st(1)
ALIGN_4

/* Restore the saved registers in reverse push order and return. */
.L999:
popl %ebx
popl %esi
ret

EPILOGUE

+ 5
- 0
kernel/x86_64/KERNEL.generic View File

@@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c

# ?sum kernels: real precisions use ../arm/sum.c, complex precisions use ../arm/zsum.c.
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c

SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c


+ 179
- 0
kernel/x86_64/sum.S View File

@@ -0,0 +1,179 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Element count, vector pointer and stride. ARG1-ARG3 are defined outside
   this file; presumably they name the first three argument registers -
   confirm in common.h. */
#define M ARG1
#define X ARG2
#define INCX ARG3

/* Loop counter. */
#define I %rax

#include "l1param.h"

/* Plain (signed) sum of M elements of X taken with stride INCX, computed
   on the x87 stack. The result is left in %st(0). */
PROLOGUE
PROFCODE

/* Push 0.0; for a non-positive count or stride this single zero is the
   return value. */
fldz
testq M, M
jle .L999
testq INCX, INCX
jle .L999

/* Scale the stride (BASE_SHIFT is presumably log2(SIZE) - confirm). */
salq $BASE_SHIFT, INCX

/* Three more zeros: four partial sums now live in %st(0)-%st(3). */
fldz
fldz
fldz
cmpq $SIZE, INCX
jne .L40

/* Unit-stride path: I = number of groups of eight elements. */
movq M, I
sarq $3, I
jle .L20
ALIGN_4

.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

/* Push four elements; the partial sums are now at %st(4)-%st(7). */
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
FLD 2 * SIZE(X)
FLD 3 * SIZE(X)

/* Each faddp adds the top value into one partial sum and pops. Because
   the stack shrinks by one each time, the targets 7, 5, 3, 1 address four
   different partial sums. */
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

FLD 4 * SIZE(X)
FLD 5 * SIZE(X)
FLD 6 * SIZE(X)
FLD 7 * SIZE(X)

addq $8 * SIZE, X

faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

decq I
jg .L10
ALIGN_4

/* Unit-stride remainder: M is reduced in place to M & 7 and then used as
   the countdown. */
.L20:
andq $7, M
jle .L998
ALIGN_4

.L21:
FLD (X)
faddp %st,%st(1)
addq $1 * SIZE, X
decq M
jg .L21
jmp .L998
ALIGN_4

/* General-stride path: same scheme, X advanced by INCX after each load. */
.L40:
movq M, I
sarq $3, I
jle .L60
ALIGN_4

.L50:
FLD (X)
addq INCX, X
FLD (X)
addq INCX, X
FLD (X)
addq INCX, X
FLD (X)
addq INCX, X

faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

FLD (X)
addq INCX, X
FLD (X)
addq INCX, X
FLD (X)
addq INCX, X
FLD (X)
addq INCX, X

faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

decq I
jg .L50
ALIGN_4

/* Strided remainder: M & 7 elements. */
.L60:
andq $7, M
jle .L998
ALIGN_4


.L61:
FLD (X)
addq INCX, X
faddp %st,%st(1)
decq M
jg .L61
ALIGN_4

/* Fold the four partial sums into a single value in %st(0). */
.L998:
faddp %st,%st(2)
faddp %st,%st(1)
faddp %st,%st(1)
ALIGN_4

.L999:
ret

EPILOGUE

+ 180
- 0
kernel/x86_64/zsum.S View File

@@ -0,0 +1,180 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Element count, vector pointer and stride. ARG1-ARG3 are defined outside
   this file; presumably they name the first three argument registers -
   confirm in common.h. */
#define M ARG1
#define X ARG2
#define INCX ARG3

/* Loop counter. */
#define I %rax

#include "l1param.h"

/* Plain (signed) sum over M two-component elements of X taken with stride
   INCX: both components of every element are added into one scalar on the
   x87 stack. The result is left in %st(0). */
PROLOGUE
PROFCODE

/* Push 0.0; for a non-positive count or stride this single zero is the
   return value. */
fldz
testq M, M
jle .L999
testq INCX, INCX
jle .L999

/* Scale the stride (ZBASE_SHIFT is presumably log2(2 * SIZE) - confirm). */
salq $ZBASE_SHIFT, INCX

/* Three more zeros: four partial sums now live in %st(0)-%st(3). */
fldz
fldz
fldz
cmpq $SIZE * 2, INCX
jne .L40

/* Contiguous path: I = number of groups of four elements (eight scalars). */
movq M, I
sarq $2, I
jle .L20
ALIGN_4

.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

/* Push four scalars; the partial sums are now at %st(4)-%st(7). */
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
FLD 2 * SIZE(X)
FLD 3 * SIZE(X)

/* Each faddp adds the top value into one partial sum and pops. Because
   the stack shrinks by one each time, the targets 7, 5, 3, 1 address four
   different partial sums. */
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

FLD 4 * SIZE(X)
FLD 5 * SIZE(X)
FLD 6 * SIZE(X)
FLD 7 * SIZE(X)

addq $8 * SIZE, X

faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

decq I
jg .L10
ALIGN_4

/* Contiguous remainder: M is reduced in place to M & 3 and then used as
   the countdown. */
.L20:
andq $3, M
jle .L998
ALIGN_4


/* One element per iteration: its two scalars go to two partial sums. */
.L21:
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
faddp %st,%st(3)
faddp %st,%st(1)
addq $2 * SIZE, X
decq M
jg .L21
jmp .L998
ALIGN_4

/* General-stride path: two scalars are read per element, then X is
   advanced by INCX. */
.L40:
movq M, I
sarq $2, I
jle .L60
ALIGN_4

.L50:
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addq INCX, X
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addq INCX, X

faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addq INCX, X
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addq INCX, X

faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)

decq I
jg .L50
ALIGN_4

/* Strided remainder: M & 3 elements. */
.L60:
andq $3, M
jle .L998
ALIGN_4


.L61:
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addq INCX, X
faddp %st,%st(3)
faddp %st,%st(1)
decq M
jg .L61
ALIGN_4

/* Fold the four partial sums into a single value in %st(0). */
.L998:
faddp %st,%st(2)
faddp %st,%st(1)
faddp %st,%st(1)
ALIGN_4

.L999:
ret

EPILOGUE

+ 5
- 0
kernel/zarch/KERNEL.Z13 View File

@@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = zasum.c

# ?sum is the plain (signed) sum, so it must not be built from the ?asum
# (sum of absolute values) sources. Double precision uses the zarch vector
# kernels dsum.c / zsum.c; single precision uses the C versions in ../arm,
# mirroring how the ?ASUMKERNEL entries above split between local and
# ../arm sources.
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = dsum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = zsum.c

SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = ../arm/zaxpy.c


+ 5
- 0
kernel/zarch/KERNEL.Z14 View File

@@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c
CASUMKERNEL = casum.c
ZASUMKERNEL = zasum.c

# ?sum kernels: each precision uses its own source file in this directory.
SSUMKERNEL = ssum.c
DSUMKERNEL = dsum.c
CSUMKERNEL = csum.c
ZSUMKERNEL = zsum.c

SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c


+ 5
- 0
kernel/zarch/KERNEL.ZARCH_GENERIC View File

@@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c

# ?sum kernels: real precisions use ../arm/sum.c, complex precisions use ../arm/zsum.c.
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c

SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c


+ 137
- 0
kernel/zarch/csum.c View File

@@ -0,0 +1,137 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

/*
 * Vector kernel for the contiguous case: returns the sum of the 2 * n
 * FLOAT values starting at x (n complex elements, both components added
 * into one scalar).
 *
 * The loop runs n >> 5 times and consumes 256 bytes per iteration, so the
 * caller must pass n as a positive multiple of 32. brctg decrements before
 * testing, so n < 32 would not terminate after zero iterations.
 *
 * Fix: the final store referenced an operand named "asum", which does not
 * exist in the operand list; it now uses the declared output "sum".
 */
static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  /* v24-v31: eight independent vector accumulators, cleared first. */
  __asm__("vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          /* n becomes the iteration count; r1 is the running byte offset. */
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* First 128 bytes of the 256-byte chunk. */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          /* Second 128 bytes of the chunk. */
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* Collapse the eight accumulators into v24. */
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vfasb %%v24,%%v24,%%v26\n\t"
          "vfasb %%v24,%%v24,%%v27\n\t"
          "vfasb %%v24,%%v24,%%v28\n\t"
          "vfasb %%v24,%%v24,%%v29\n\t"
          "vfasb %%v24,%%v24,%%v30\n\t"
          "vfasb %%v24,%%v24,%%v31\n\t"
          /* Horizontal add: shifting each doubleword left by 32 bits lines
             lanes 1 and 3 up with lanes 0 and 2; lane 2 is then replicated
             and added so that lane 0 holds the total. */
          "veslg %%v25,%%v24,32\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vrepf %%v25,%%v24,2\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          /* Store lane 0 into the output operand. */
          "vstef %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}

/*
 * Sum of all components of a complex vector: for each of the n elements,
 * taken every inc_x complex elements, both stored FLOAT values are added
 * to the result. Returns 0 when n or inc_x is not positive.
 *
 * With inc_x == 1 the largest multiple-of-32 prefix is handed to
 * csum_kernel_32 and only the leftover elements are summed in C.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG done = 0;  /* complex elements already accounted for */
  BLASLONG pos = 0;   /* current index into x, in FLOAT units */
  BLASLONG step;      /* distance between elements, in FLOAT units */
  BLASLONG bulk;

  if (n <= 0 || inc_x <= 0)
    return (total);

  if (inc_x != 1) {
    step = 2 * inc_x;
  } else {
    step = 2;

    /* Round n down to a multiple of 32 for the vector kernel. */
    bulk = n & -32;
    if (bulk > 0) {
      total = csum_kernel_32(bulk, x);
      done = bulk;
      pos = 2 * bulk;
    }
  }

  /* Scalar loop: everything for the strided case, the tail otherwise. */
  for (; done < n; done++, pos += step)
    total += x[pos] + x[pos + 1];

  return (total);
}

+ 148
- 0
kernel/zarch/dsum.c View File

@@ -0,0 +1,148 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>

/*
 * Vector kernel: returns the plain (non-absolute) sum of n contiguous
 * double-precision values starting at x.
 *
 * Each loop iteration consumes 256 bytes (32 doubles) and accumulates them
 * into eight independent vector accumulators (v24..v31), which are folded
 * into a single scalar after the loop.
 *
 * n must be a positive multiple of 32: the trip count is n >> 5 and the
 * loop is a count-down loop that executes its body before testing, so it
 * must never be entered with a count of zero.  The driver below only calls
 * this with n1 = n & -32 when n1 > 0.
 *
 * Fix: the final store referenced the named operand %[asum], but the only
 * output operand declared is [sum]; GCC rejects an undefined named operand,
 * so the template now uses %[sum].
 */
static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  /* Clear the eight accumulators. */
  __asm__("vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          /* n becomes the block count; r1 is the byte offset into x. */
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* First 128 bytes of the block. */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          /* Second 128 bytes of the block. */
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* Fold the eight accumulators into v24 ... */
          "vfadb %%v24,%%v24,%%v25\n\t"
          "vfadb %%v24,%%v24,%%v26\n\t"
          "vfadb %%v24,%%v24,%%v27\n\t"
          "vfadb %%v24,%%v24,%%v28\n\t"
          "vfadb %%v24,%%v24,%%v29\n\t"
          "vfadb %%v24,%%v24,%%v30\n\t"
          "vfadb %%v24,%%v24,%%v31\n\t"
          /* ... then add its two doubleword lanes and store lane 0. */
          "vrepg %%v25,%%v24,1\n\t"
          "vfadb %%v24,%%v24,%%v25\n\t"
          "vsteg %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}

/*
 * Returns the sum of n elements of x taken with stride inc_x.
 * Yields 0.0 when n or inc_x is not strictly positive.
 *
 * Unit stride: the largest multiple-of-32 prefix goes through
 * dsum_kernel_32 and the remainder is added one element at a time.
 * Other strides: elements are accumulated four at a time into two partial
 * sums, which are combined before the leftover elements are added.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG pos = 0;   /* index into x */
  BLASLONG done = 0;  /* elements consumed so far on the strided path */

  if (!(n > 0 && inc_x > 0))
    return total;

  if (inc_x != 1) {
    BLASLONG unrolled = n & -4;
    register FLOAT even, odd;

    even = 0.0;
    odd = 0.0;

    /* Four strided elements per pass, alternating between the partials. */
    for (; done < unrolled; done += 4, pos += inc_x * 4) {
      even += x[pos];
      odd += x[pos + inc_x];
      even += x[pos + 2 * inc_x];
      odd += x[pos + 3 * inc_x];
    }

    total = even + odd;

    /* Up to three leftover elements. */
    for (; done < n; done++, pos += inc_x)
      total += x[pos];

  } else {
    BLASLONG blocked = n & -32;

    if (blocked > 0) {
      total = dsum_kernel_32(blocked, x);
      pos = blocked;
    }

    /* Scalar tail after the vector kernel. */
    for (; pos < n; pos++)
      total += x[pos];
  }

  return total;
}

+ 151
- 0
kernel/zarch/ssum.c View File

@@ -0,0 +1,151 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>


/*
 * Vector kernel: returns the plain (non-absolute) sum of n contiguous
 * single-precision values starting at x.
 *
 * Each loop iteration consumes 256 bytes (64 floats) and accumulates them
 * into eight independent vector accumulators (v24..v31), which are folded
 * into a single scalar after the loop.
 *
 * n must be a positive multiple of 64: the trip count is n >> 6 and the
 * loop is a count-down loop that executes its body before testing, so it
 * must never be entered with a count of zero.  The driver below only calls
 * this with n1 = n & -64 when n1 > 0.
 *
 * Fix: the final store referenced the named operand %[asum], but the only
 * output operand declared is [sum]; GCC rejects an undefined named operand,
 * so the template now uses %[sum].
 */
static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  /* Clear the eight accumulators. */
  __asm__("vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          /* n becomes the block count; r1 is the byte offset into x. */
          "srlg %[n],%[n],6\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* First 128 bytes of the block. */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          /* Second 128 bytes of the block. */
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* Fold the eight accumulators into v24 ... */
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vfasb %%v24,%%v24,%%v26\n\t"
          "vfasb %%v24,%%v24,%%v27\n\t"
          "vfasb %%v24,%%v24,%%v28\n\t"
          "vfasb %%v24,%%v24,%%v29\n\t"
          "vfasb %%v24,%%v24,%%v30\n\t"
          "vfasb %%v24,%%v24,%%v31\n\t"
          /* ... then reduce the four word lanes of v24 into lane 0 and
             store that lane. */
          "veslg %%v25,%%v24,32\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vrepf %%v25,%%v24,2\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vstef %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}

/*
 * Returns the sum of n elements of x taken with stride inc_x.
 * Yields 0.0 when n or inc_x is not strictly positive.
 *
 * Unit stride: the largest multiple-of-64 prefix goes through
 * ssum_kernel_64 and the remainder is added one element at a time.
 * Other strides: elements are accumulated four at a time into two partial
 * sums, which are combined before the leftover elements are added.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG pos = 0;   /* index into x */
  BLASLONG done = 0;  /* elements consumed so far on the strided path */

  if (!(n > 0 && inc_x > 0))
    return total;

  if (inc_x != 1) {
    BLASLONG unrolled = n & -4;
    register FLOAT even, odd;

    even = 0.0;
    odd = 0.0;

    /* Four strided elements per pass, alternating between the partials. */
    for (; done < unrolled; done += 4, pos += inc_x * 4) {
      even += x[pos];
      odd += x[pos + inc_x];
      even += x[pos + 2 * inc_x];
      odd += x[pos + 3 * inc_x];
    }

    total = even + odd;

    /* Up to three leftover elements. */
    for (; done < n; done++, pos += inc_x)
      total += x[pos];

  } else {
    BLASLONG blocked = n & -64;

    if (blocked > 0) {
      total = ssum_kernel_64(blocked, x);
      pos = blocked;
    }

    /* Scalar tail after the vector kernel. */
    for (; pos < n; pos++)
      total += x[pos];
  }

  return total;
}

+ 136
- 0
kernel/zarch/zsum.c View File

@@ -0,0 +1,136 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"
#include <math.h>


/*
 * Vector kernel: returns the plain (non-absolute) sum of all 2*n
 * double-precision values (real and imaginary parts alike) of the n
 * contiguous complex elements starting at x.
 *
 * Each loop iteration consumes 256 bytes (16 complex elements) and
 * accumulates them into eight independent vector accumulators (v24..v31),
 * which are folded into a single scalar after the loop.
 *
 * n must be a positive multiple of 16: the trip count is n >> 4 and the
 * loop is a count-down loop that executes its body before testing, so it
 * must never be entered with a count of zero.  The driver below only calls
 * this with n1 = n & -16 when n1 > 0.
 *
 * Fix: the final store referenced the named operand %[asum], but the only
 * output operand declared is [sum]; GCC rejects an undefined named operand,
 * so the template now uses %[sum].
 */
static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  /* Clear the eight accumulators. */
  __asm__("vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          /* n becomes the block count; r1 is the byte offset into x. */
          "srlg %[n],%[n],4\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* First 128 bytes of the block. */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          /* Second 128 bytes of the block. */
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* Fold the eight accumulators into v24 ... */
          "vfadb %%v24,%%v24,%%v25\n\t"
          "vfadb %%v24,%%v24,%%v26\n\t"
          "vfadb %%v24,%%v24,%%v27\n\t"
          "vfadb %%v24,%%v24,%%v28\n\t"
          "vfadb %%v24,%%v24,%%v29\n\t"
          "vfadb %%v24,%%v24,%%v30\n\t"
          "vfadb %%v24,%%v24,%%v31\n\t"
          /* ... then add its two doubleword lanes and store lane 0. */
          "vrepg %%v25,%%v24,1\n\t"
          "vfadb %%v24,%%v24,%%v25\n\t"
          "vsteg %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}

/*
 * Returns the sum of the real and imaginary parts of n complex elements
 * of x, where consecutive elements are inc_x complex slots apart (each
 * element occupies two adjacent FLOATs).
 * Yields 0.0 when n or inc_x is not strictly positive.
 *
 * Unit stride: the largest multiple-of-16 prefix goes through
 * zsum_kernel_16; whatever remains is added element by element.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG done = 0;  /* complex elements consumed so far */
  BLASLONG pos = 0;   /* FLOAT index of the current real part */
  BLASLONG step;      /* FLOAT distance between consecutive elements */

  if (!(n > 0 && inc_x > 0))
    return total;

  if (inc_x != 1) {
    step = 2 * inc_x;
  } else {
    BLASLONG blocked = n & -16;

    if (blocked > 0) {
      total = zsum_kernel_16(blocked, x);
      done = blocked;
      pos = 2 * blocked;
    }
    step = 2;
  }

  /* Scalar path: the strided case in full, or the tail after the kernel. */
  for (; done < n; done++, pos += step)
    total += x[pos] + x[pos + 1];

  return total;
}

Loading…
Cancel
Save