Add (C)BLAS extension ?sum (tags/v0.3.6^2)
| @@ -73,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS | |||
| float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX); | |||
| double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX); | |||
| float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX); | |||
| @@ -107,6 +107,12 @@ macro(SetDefaultL1) | |||
| set(DAXPBYKERNEL ../arm/axpby.c) | |||
| set(CAXPBYKERNEL ../arm/zaxpby.c) | |||
| set(ZAXPBYKERNEL ../arm/zaxpby.c) | |||
| set(SSUMKERNEL sum.S) | |||
| set(DSUMKERNEL sum.S) | |||
| set(CSUMKERNEL zsum.S) | |||
| set(ZSUMKERNEL zsum.S) | |||
| set(QSUMKERNEL sum.S) | |||
| set(XSUMKERNEL zsum.S) | |||
| endmacro () | |||
| macro(SetDefaultL2) | |||
| @@ -162,4 +168,4 @@ macro(SetDefaultL3) | |||
| set(DGEADD_KERNEL ../generic/geadd.c) | |||
| set(CGEADD_KERNEL ../generic/zgeadd.c) | |||
| set(ZGEADD_KERNEL ../generic/zgeadd.c) | |||
| endmacro () | |||
| endmacro () | |||
| @@ -19,6 +19,7 @@ | |||
| #define CDOTC_K cdotc_k | |||
| #define CNRM2_K cnrm2_k | |||
| #define CSCAL_K cscal_k | |||
| #define CSUM_K csum_k | |||
| #define CSWAP_K cswap_k | |||
| #define CROT_K csrot_k | |||
| @@ -249,6 +250,7 @@ | |||
| #define CDOTC_K gotoblas -> cdotc_k | |||
| #define CNRM2_K gotoblas -> cnrm2_k | |||
| #define CSCAL_K gotoblas -> cscal_k | |||
| #define CSUM_K gotoblas -> csum_k | |||
| #define CSWAP_K gotoblas -> cswap_k | |||
| #define CROT_K gotoblas -> csrot_k | |||
| @@ -19,6 +19,7 @@ | |||
| #define DDOTC_K ddot_k | |||
| #define DNRM2_K dnrm2_k | |||
| #define DSCAL_K dscal_k | |||
| #define DSUM_K dsum_k | |||
| #define DSWAP_K dswap_k | |||
| #define DROT_K drot_k | |||
| @@ -174,6 +175,7 @@ | |||
| #define DDOTC_K gotoblas -> ddot_k | |||
| #define DNRM2_K gotoblas -> dnrm2_k | |||
| #define DSCAL_K gotoblas -> dscal_k | |||
| #define DSUM_K gotoblas -> dsum_k | |||
| #define DSWAP_K gotoblas -> dswap_k | |||
| #define DROT_K gotoblas -> drot_k | |||
| @@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *); | |||
| double BLASFUNC(dzasum)(blasint *, double *, blasint *); | |||
| xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *); | |||
| FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *); | |||
| FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *); | |||
| double BLASFUNC(dsum) (blasint *, double *, blasint *); | |||
| xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *); | |||
| double BLASFUNC(dzsum)(blasint *, double *, blasint *); | |||
| xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *); | |||
| blasint BLASFUNC(isamax)(blasint *, float *, blasint *); | |||
| blasint BLASFUNC(idamax)(blasint *, double *, blasint *); | |||
| blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *); | |||
| @@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG); | |||
| double zasum_k (BLASLONG, double *, BLASLONG); | |||
| xdouble xasum_k (BLASLONG, xdouble *, BLASLONG); | |||
| float ssum_k (BLASLONG, float *, BLASLONG); | |||
| double dsum_k (BLASLONG, double *, BLASLONG); | |||
| xdouble qsum_k (BLASLONG, xdouble *, BLASLONG); | |||
| float csum_k (BLASLONG, float *, BLASLONG); | |||
| double zsum_k (BLASLONG, double *, BLASLONG); | |||
| xdouble xsum_k (BLASLONG, xdouble *, BLASLONG); | |||
| float samax_k (BLASLONG, float *, BLASLONG); | |||
| double damax_k (BLASLONG, double *, BLASLONG); | |||
| xdouble qamax_k (BLASLONG, xdouble *, BLASLONG); | |||
| @@ -66,6 +66,7 @@ | |||
| #define DOTC_K QDOTC_K | |||
| #define NRM2_K QNRM2_K | |||
| #define SCAL_K QSCAL_K | |||
| #define SUM_K QSUM_K | |||
| #define SWAP_K QSWAP_K | |||
| #define ROT_K QROT_K | |||
| @@ -356,6 +357,7 @@ | |||
| #define DOTC_K DDOTC_K | |||
| #define NRM2_K DNRM2_K | |||
| #define SCAL_K DSCAL_K | |||
| #define SUM_K DSUM_K | |||
| #define SWAP_K DSWAP_K | |||
| #define ROT_K DROT_K | |||
| @@ -658,6 +660,7 @@ | |||
| #define DOTC_K SDOTC_K | |||
| #define NRM2_K SNRM2_K | |||
| #define SCAL_K SSCAL_K | |||
| #define SUM_K SSUM_K | |||
| #define SWAP_K SSWAP_K | |||
| #define ROT_K SROT_K | |||
| @@ -962,6 +965,7 @@ | |||
| #define DOTC_K XDOTC_K | |||
| #define NRM2_K XNRM2_K | |||
| #define SCAL_K XSCAL_K | |||
| #define SUM_K XSUM_K | |||
| #define SWAP_K XSWAP_K | |||
| #define ROT_K XROT_K | |||
| @@ -1363,6 +1367,7 @@ | |||
| #define DOTC_K ZDOTC_K | |||
| #define NRM2_K ZNRM2_K | |||
| #define SCAL_K ZSCAL_K | |||
| #define SUM_K ZSUM_K | |||
| #define SWAP_K ZSWAP_K | |||
| #define ROT_K ZROT_K | |||
| @@ -1785,6 +1790,7 @@ | |||
| #define DOTC_K CDOTC_K | |||
| #define NRM2_K CNRM2_K | |||
| #define SCAL_K CSCAL_K | |||
| #define SUM_K CSUM_K | |||
| #define SWAP_K CSWAP_K | |||
| #define ROT_K CROT_K | |||
| @@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
| float (*snrm2_k) (BLASLONG, float *, BLASLONG); | |||
| float (*sasum_k) (BLASLONG, float *, BLASLONG); | |||
| float (*ssum_k) (BLASLONG, float *, BLASLONG); | |||
| int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| @@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||
| double (*dnrm2_k) (BLASLONG, double *, BLASLONG); | |||
| double (*dasum_k) (BLASLONG, double *, BLASLONG); | |||
| double (*dsum_k) (BLASLONG, double *, BLASLONG); | |||
| int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | |||
| @@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG); | |||
| int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | |||
| @@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); | |||
| float (*cnrm2_k) (BLASLONG, float *, BLASLONG); | |||
| float (*casum_k) (BLASLONG, float *, BLASLONG); | |||
| float (*csum_k) (BLASLONG, float *, BLASLONG); | |||
| int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| @@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); | |||
| double (*znrm2_k) (BLASLONG, double *, BLASLONG); | |||
| double (*zasum_k) (BLASLONG, double *, BLASLONG); | |||
| double (*zsum_k) (BLASLONG, double *, BLASLONG); | |||
| int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| @@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG); | |||
| int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| @@ -19,6 +19,7 @@ | |||
| #define QDOTC_K qdot_k | |||
| #define QNRM2_K qnrm2_k | |||
| #define QSCAL_K qscal_k | |||
| #define QSUM_K qsum_k | |||
| #define QSWAP_K qswap_k | |||
| #define QROT_K qrot_k | |||
| @@ -161,6 +162,7 @@ | |||
| #define QDOTC_K gotoblas -> qdot_k | |||
| #define QNRM2_K gotoblas -> qnrm2_k | |||
| #define QSCAL_K gotoblas -> qscal_k | |||
| #define QSUM_K gotoblas -> qsum_k | |||
| #define QSWAP_K gotoblas -> qswap_k | |||
| #define QROT_K gotoblas -> qrot_k | |||
| @@ -12,6 +12,7 @@ | |||
| #define ISMAX_K ismax_k | |||
| #define ISMIN_K ismin_k | |||
| #define SASUM_K sasum_k | |||
| #define SSUM_K ssum_k | |||
| #define SAXPYU_K saxpy_k | |||
| #define SAXPYC_K saxpy_k | |||
| #define SCOPY_K scopy_k | |||
| @@ -170,6 +171,7 @@ | |||
| #define ISMAX_K gotoblas -> ismax_k | |||
| #define ISMIN_K gotoblas -> ismin_k | |||
| #define SASUM_K gotoblas -> sasum_k | |||
| #define SSUM_K gotoblas -> ssum_k | |||
| #define SAXPYU_K gotoblas -> saxpy_k | |||
| #define SAXPYC_K gotoblas -> saxpy_k | |||
| #define SCOPY_K gotoblas -> scopy_k | |||
| @@ -19,6 +19,7 @@ | |||
| #define XDOTC_K xdotc_k | |||
| #define XNRM2_K xnrm2_k | |||
| #define XSCAL_K xscal_k | |||
| #define XSUM_K xsum_k | |||
| #define XSWAP_K xswap_k | |||
| #define XROT_K xqrot_k | |||
| @@ -227,6 +228,7 @@ | |||
| #define XDOTC_K gotoblas -> xdotc_k | |||
| #define XNRM2_K gotoblas -> xnrm2_k | |||
| #define XSCAL_K gotoblas -> xscal_k | |||
| #define XSUM_K gotoblas -> xsum_k | |||
| #define XSWAP_K gotoblas -> xswap_k | |||
| #define XROT_K gotoblas -> xqrot_k | |||
| @@ -19,6 +19,7 @@ | |||
| #define ZDOTC_K zdotc_k | |||
| #define ZNRM2_K znrm2_k | |||
| #define ZSCAL_K zscal_k | |||
| #define ZSUM_K zsum_k | |||
| #define ZSWAP_K zswap_k | |||
| #define ZROT_K zdrot_k | |||
| @@ -249,6 +250,7 @@ | |||
| #define ZDOTC_K gotoblas -> zdotc_k | |||
| #define ZNRM2_K gotoblas -> znrm2_k | |||
| #define ZSCAL_K gotoblas -> zscal_k | |||
| #define ZSUM_K gotoblas -> zsum_k | |||
| #define ZSWAP_K gotoblas -> zswap_k | |||
| #define ZROT_K gotoblas -> zdrot_k | |||
| @@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES | |||
| rotm.c rotmg.c # N.B. these do not have complex counterparts | |||
| rot.c | |||
| asum.c | |||
| sum.c | |||
| ) | |||
| # these will have 'z' prepended for the complex version | |||
| @@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||
| GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX") | |||
| GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") | |||
| GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX") | |||
| GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX") | |||
| endif () | |||
| if (${float_type} STREQUAL "ZCOMPLEX") | |||
| GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") | |||
| @@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||
| GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | |||
| GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | |||
| GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | |||
| GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | |||
| endif () | |||
| endforeach () | |||
| @@ -25,7 +25,7 @@ SBLAS1OBJS = \ | |||
| saxpy.$(SUFFIX) sswap.$(SUFFIX) \ | |||
| scopy.$(SUFFIX) sscal.$(SUFFIX) \ | |||
| sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ | |||
| sasum.$(SUFFIX) snrm2.$(SUFFIX) \ | |||
| sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \ | |||
| smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ | |||
| smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ | |||
| srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ | |||
| @@ -51,7 +51,7 @@ DBLAS1OBJS = \ | |||
| daxpy.$(SUFFIX) dswap.$(SUFFIX) \ | |||
| dcopy.$(SUFFIX) dscal.$(SUFFIX) \ | |||
| ddot.$(SUFFIX) \ | |||
| dasum.$(SUFFIX) dnrm2.$(SUFFIX) \ | |||
| dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \ | |||
| dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ | |||
| dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ | |||
| drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ | |||
| @@ -76,7 +76,7 @@ CBLAS1OBJS = \ | |||
| caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ | |||
| ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ | |||
| cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ | |||
| scasum.$(SUFFIX) scnrm2.$(SUFFIX) \ | |||
| scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \ | |||
| scamax.$(SUFFIX) icamax.$(SUFFIX) \ | |||
| scamin.$(SUFFIX) icamin.$(SUFFIX) \ | |||
| csrot.$(SUFFIX) crotg.$(SUFFIX) \ | |||
| @@ -105,7 +105,7 @@ ZBLAS1OBJS = \ | |||
| zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ | |||
| zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ | |||
| zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ | |||
| dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \ | |||
| dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \ | |||
| dzamax.$(SUFFIX) izamax.$(SUFFIX) \ | |||
| dzamin.$(SUFFIX) izamin.$(SUFFIX) \ | |||
| zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ | |||
| @@ -146,7 +146,7 @@ QBLAS1OBJS = \ | |||
| qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ | |||
| qcopy.$(SUFFIX) qscal.$(SUFFIX) \ | |||
| qdot.$(SUFFIX) \ | |||
| qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||
| qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||
| qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ | |||
| qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ | |||
| qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ | |||
| @@ -168,7 +168,7 @@ XBLAS1OBJS = \ | |||
| xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ | |||
| xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ | |||
| xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ | |||
| qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||
| qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||
| qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ | |||
| qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ | |||
| xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ | |||
| @@ -203,7 +203,7 @@ ifdef QUAD_PRECISION | |||
| QBLAS1OBJS = \ | |||
| qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ | |||
| qcopy.$(SUFFIX) qscal.$(SUFFIX) \ | |||
| qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||
| qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||
| qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ | |||
| qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ | |||
| qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ | |||
| @@ -224,7 +224,7 @@ QBLAS3OBJS = \ | |||
| XBLAS1OBJS = \ | |||
| xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ | |||
| xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ | |||
| qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||
| qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||
| qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ | |||
| qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ | |||
| xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ | |||
| @@ -264,7 +264,7 @@ CSBLAS1OBJS = \ | |||
| cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | |||
| cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | |||
| cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ | |||
| cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) | |||
| cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) | |||
| CSBLAS2OBJS = \ | |||
| cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ | |||
| @@ -282,7 +282,7 @@ CDBLAS1OBJS = \ | |||
| cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | |||
| cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | |||
| cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ | |||
| cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) | |||
| cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) | |||
| CDBLAS2OBJS = \ | |||
| cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ | |||
| @@ -303,7 +303,7 @@ CCBLAS1OBJS = \ | |||
| cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | |||
| cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | |||
| cblas_caxpby.$(SUFFIX) \ | |||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) | |||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) | |||
| CCBLAS2OBJS = \ | |||
| cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ | |||
| @@ -330,7 +330,7 @@ CZBLAS1OBJS = \ | |||
| cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | |||
| cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | |||
| cblas_zaxpby.$(SUFFIX) \ | |||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) | |||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) | |||
| CZBLAS2OBJS = \ | |||
| @@ -565,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c | |||
| qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| @@ -1412,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c | |||
| cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| @@ -1419,7 +1449,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| @@ -0,0 +1,97 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef FUNCTION_PROFILE | |||
| #include "functable.h" | |||
| #endif | |||
| #ifndef CBLAS /* Without -DCBLAS: build the entry point that receives n and incx by pointer. */ | |||
| FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ /* Returns the SUM_K result for n elements of x with stride incx. */ | |||
| BLASLONG n = *N; /* Dereference the by-pointer length and widen it to BLASLONG for the kernel. */ | |||
| BLASLONG incx = *INCX; /* Stride is forwarded to the kernel as-is; it is not validated here. */ | |||
| FLOATRET ret; | |||
| PRINT_DEBUG_NAME; | |||
| if (n <= 0) return 0; /* Non-positive length: return 0 without calling the kernel. */ | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| ret = (FLOATRET)SUM_K(n, x, incx); /* SUM_K is the per-precision kernel alias (e.g. SSUM_K, DSUM_K, CSUM_K, ZSUM_K). */ | |||
| FUNCTION_PROFILE_END(COMPSIZE, n, n); | |||
| IDEBUG_END; | |||
| return ret; | |||
| } | |||
| #else /* With -DCBLAS: build the by-value cblas_* entry point instead. */ | |||
| #ifdef COMPLEX /* Complex builds take the vector as void* and view it as FLOAT*. */ | |||
| FLOAT CNAME(blasint n, void *vx, blasint incx){ | |||
| FLOAT *x = (FLOAT*) vx; /* Reinterpret the untyped vector pointer as FLOAT* for the kernel call. */ | |||
| #else /* Real builds take a typed FLOAT* directly. */ | |||
| FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||
| #endif | |||
| FLOAT ret; | |||
| PRINT_DEBUG_CNAME; | |||
| if (n <= 0) return 0; /* Non-positive length: return 0 without calling the kernel. */ | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| ret = SUM_K(n, x, incx); /* Same kernel dispatch as the pointer-argument variant; incx is not validated here. */ | |||
| FUNCTION_PROFILE_END(COMPSIZE, n, n); | |||
| IDEBUG_END; | |||
| return ret; | |||
| } | |||
| #endif | |||
| @@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type}) | |||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type}) | |||
| @@ -340,6 +340,32 @@ ifndef XSCALKERNEL | |||
| XSCALKERNEL = zscal.S | |||
| endif | |||
| ### SUM ### | |||
| ifndef SSUMKERNEL | |||
| SSUMKERNEL = sum.S | |||
| endif | |||
| ifndef DSUMKERNEL | |||
| DSUMKERNEL = sum.S | |||
| endif | |||
| ifndef CSUMKERNEL | |||
| CSUMKERNEL = zsum.S | |||
| endif | |||
| ifndef ZSUMKERNEL | |||
| ZSUMKERNEL = zsum.S | |||
| endif | |||
| ifndef QSUMKERNEL | |||
| QSUMKERNEL = sum.S | |||
| endif | |||
| ifndef XSUMKERNEL | |||
| XSUMKERNEL = zsum.S | |||
| endif | |||
| ### SWAP ### | |||
| ifndef SSWAPKERNEL | |||
| @@ -453,7 +479,7 @@ endif | |||
| SBLASOBJS += \ | |||
| samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \ | |||
| isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \ | |||
| sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | |||
| sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | |||
| sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ | |||
| snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ | |||
| saxpby_k$(TSUFFIX).$(SUFFIX) | |||
| @@ -463,31 +489,32 @@ DBLASOBJS += \ | |||
| idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ | |||
| dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ | |||
| dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ | |||
| daxpby_k$(TSUFFIX).$(SUFFIX) | |||
| daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) | |||
| QBLASOBJS += \ | |||
| qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ | |||
| iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ | |||
| qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ | |||
| qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) | |||
| qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ | |||
| qsum_k$(TSUFFIX).$(SUFFIX) | |||
| CBLASOBJS += \ | |||
| camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ | |||
| casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \ | |||
| cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \ | |||
| cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) | |||
| cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX) | |||
| ZBLASOBJS += \ | |||
| zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \ | |||
| zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \ | |||
| zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \ | |||
| zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) | |||
| zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX) | |||
| XBLASOBJS += \ | |||
| xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \ | |||
| xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \ | |||
| xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ | |||
| xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) | |||
| xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) | |||
| ### AMAX ### | |||
| @@ -617,7 +644,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KE | |||
| $(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ | |||
| ### ASUM ### | |||
| $(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | |||
| @@ -636,6 +663,26 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE | |||
| $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | |||
| ### SUM ### | |||
| $(KDIR)ssum_k$(TSUFFIX).$(SUFFIX) $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | |||
| $(KDIR)dsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ | |||
| $(KDIR)qsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ | |||
| $(KDIR)csum_k$(TSUFFIX).$(SUFFIX) $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ | |||
| $(KDIR)zsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ | |||
| $(KDIR)xsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | |||
| ### AXPY ### | |||
| $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | |||
| @@ -0,0 +1,206 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "version.h" | |||
| #define PREFETCHSIZE 88 /* scales the offset of the ldl $31 load in the main loop (presumably a prefetch hint) */ | |||
| #define N $16 /* element count; tested with ble on entry */ | |||
| #define X $17 /* address of the element to load next */ | |||
| #define INCX $18 /* stride, applied to X through SXADDQ */ | |||
| #define I $19 /* loop counter */ | |||
| #define s0 $f0 /* s0-s3: partial sums; s0 holds the total at ret */ | |||
| #define s1 $f1 | |||
| #define s2 $f10 | |||
| #define s3 $f11 | |||
| #define a0 $f12 /* a0-a7: elements loaded from X */ | |||
| #define a1 $f13 | |||
| #define a2 $f14 | |||
| #define a3 $f15 | |||
| #define a4 $f16 | |||
| #define a5 $f17 | |||
| #define a6 $f18 | |||
| #define a7 $f19 | |||
| #define t0 $f20 /* t0-t3: values staged by fmov for the following ADD into s0-s3 */ | |||
| #define t1 $f21 | |||
| #define t2 $f22 | |||
| #define t3 $f23 | |||
| PROLOGUE /* Sum kernel: adds N elements read from X, stepping X by INCX after each load, and leaves the total in s0. */ | |||
| PROFCODE | |||
| fclr s0 | |||
| unop | |||
| fclr t0 | |||
| ble N, $L999 /* N <= 0: leave with s0 = 0 */ | |||
| sra N, 3, I /* I = N / 8, the number of full 8-element groups */ | |||
| fclr s1 | |||
| fclr s2 | |||
| ble I, $L15 /* no full group: only the remainder loop runs */ | |||
| LD a0, 0 * SIZE(X) /* preload a0-a5, the first six elements of the first group */ | |||
| fclr t1 | |||
| SXADDQ INCX, X, X /* step X to the next element */ | |||
| fclr t2 | |||
| LD a1, 0 * SIZE(X) | |||
| fclr t3 | |||
| SXADDQ INCX, X, X | |||
| fclr s3 | |||
| LD a2, 0 * SIZE(X) | |||
| SXADDQ INCX, X, X | |||
| LD a3, 0 * SIZE(X) | |||
| SXADDQ INCX, X, X | |||
| LD a4, 0 * SIZE(X) | |||
| SXADDQ INCX, X, X | |||
| LD a5, 0 * SIZE(X) | |||
| SXADDQ INCX, X, X | |||
| lda I, -1(I) /* the last group is handled by the drain code at $L13 */ | |||
| ble I, $L13 | |||
| .align 4 | |||
| $L12: /* steady state: add staged values while loading a6/a7 of this group and a0-a5 of the next */ | |||
| ADD s0, t0, s0 /* add the value staged one step earlier */ | |||
| ldl $31, PREFETCHSIZE * 2 * SIZE(X) /* load into $31; presumably a prefetch hint */ | |||
| fmov a0, t0 /* stage a0 unchanged for the next ADD into s0 */ | |||
| lda I, -1(I) | |||
| ADD s1, t1, s1 | |||
| LD a6, 0 * SIZE(X) | |||
| fmov a1, t1 | |||
| SXADDQ INCX, X, X | |||
| ADD s2, t2, s2 | |||
| LD a7, 0 * SIZE(X) | |||
| fmov a2, t2 | |||
| SXADDQ INCX, X, X | |||
| ADD s3, t3, s3 | |||
| LD a0, 0 * SIZE(X) /* a0 was already staged in t0, so it can be reloaded for the next group */ | |||
| fmov a3, t3 | |||
| SXADDQ INCX, X, X | |||
| ADD s0, t0, s0 | |||
| LD a1, 0 * SIZE(X) | |||
| fmov a4, t0 | |||
| SXADDQ INCX, X, X | |||
| ADD s1, t1, s1 | |||
| LD a2, 0 * SIZE(X) | |||
| fmov a5, t1 | |||
| SXADDQ INCX, X, X | |||
| ADD s2, t2, s2 | |||
| LD a3, 0 * SIZE(X) | |||
| fmov a6, t2 | |||
| SXADDQ INCX, X, X | |||
| ADD s3, t3, s3 | |||
| LD a4, 0 * SIZE(X) | |||
| fmov a7, t3 | |||
| SXADDQ INCX, X, X | |||
| LD a5, 0 * SIZE(X) | |||
| unop | |||
| SXADDQ INCX, X, X | |||
| bne I, $L12 | |||
| .align 4 | |||
| $L13: /* drain: finish the last full group; only a6 and a7 remain to be loaded */ | |||
| ADD s0, t0, s0 | |||
| LD a6, 0 * SIZE(X) | |||
| fmov a0, t0 | |||
| SXADDQ INCX, X, X | |||
| ADD s1, t1, s1 | |||
| LD a7, 0 * SIZE(X) | |||
| fmov a1, t1 | |||
| SXADDQ INCX, X, X | |||
| ADD s2, t2, s2 | |||
| fmov a2, t2 | |||
| ADD s3, t3, s3 | |||
| fmov a3, t3 | |||
| ADD s0, t0, s0 | |||
| fmov a4, t0 /* this t0 is added later, at $L17 or $L999 */ | |||
| ADD s1, t1, s1 | |||
| fmov a5, t1 | |||
| ADD s2, t2, s2 | |||
| fmov a6, t2 | |||
| ADD s3, t3, s3 | |||
| fmov a7, t3 | |||
| ADD s1, t1, s1 | |||
| ADD s2, t2, s2 | |||
| ADD s3, t3, s3 | |||
| ADD s0, s1, s0 /* fold the partial sums pairwise: s0 += s1, s2 += s3 */ | |||
| ADD s2, s3, s2 | |||
| .align 4 | |||
| $L15: | |||
| and N, 7, I /* I = N mod 8 leftover elements */ | |||
| ADD s0, s2, s0 | |||
| unop | |||
| ble I, $L999 | |||
| .align 4 | |||
| $L17: /* remainder loop: one element per pass */ | |||
| ADD s0, t0, s0 /* add the value staged by the previous pass (or by the drain code) */ | |||
| LD a0, 0 * SIZE(X) | |||
| SXADDQ INCX, X, X | |||
| fmov a0, t0 | |||
| lda I, -1(I) | |||
| bne I, $L17 | |||
| .align 4 | |||
| $L999: | |||
| ADD s0, t0, s0 /* add the last staged value; s0 now holds the total */ | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,208 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "version.h" | |||
| #define PREFETCHSIZE 88 /* scales the offset of the ldl $31 load in the main loop (presumably a prefetch hint) */ | |||
| #define N $16 /* element count; tested with ble on entry */ | |||
| #define X $17 /* address of the pair of values to load next */ | |||
| #define INCX $18 /* stride; doubled on entry, then applied to X through SXADDQ */ | |||
| #define I $19 /* loop counter */ | |||
| #define s0 $f0 /* s0-s3: partial sums; s0 holds the total at ret */ | |||
| #define s1 $f1 | |||
| #define s2 $f10 | |||
| #define s3 $f11 | |||
| #define a0 $f12 /* a0-a7: values loaded from X, two per element */ | |||
| #define a1 $f13 | |||
| #define a2 $f14 | |||
| #define a3 $f15 | |||
| #define a4 $f16 | |||
| #define a5 $f17 | |||
| #define a6 $f18 | |||
| #define a7 $f19 | |||
| #define t0 $f20 /* t0-t3: values staged by fmov for the following ADD into s0-s3 */ | |||
| #define t1 $f21 | |||
| #define t2 $f22 | |||
| #define t3 $f23 | |||
| PROLOGUE /* Sum kernel for two-value elements: adds both values (offsets 0 and 1) of N elements of X and leaves the total in s0. */ | |||
| PROFCODE | |||
| fclr s0 | |||
| unop | |||
| fclr t0 | |||
| addq INCX, INCX, INCX /* double the stride: every element occupies two values */ | |||
| fclr s1 | |||
| unop | |||
| fclr t1 | |||
| ble N, $L999 /* N <= 0: leave with s0 = 0 */ | |||
| fclr s2 | |||
| sra N, 2, I /* I = N / 4, the number of full 4-element groups */ | |||
| fclr s3 | |||
| ble I, $L15 /* no full group: only the remainder loop runs */ | |||
| LD a0, 0 * SIZE(X) /* preload a0-a5, i.e. the first three elements of the first group */ | |||
| fclr t2 | |||
| LD a1, 1 * SIZE(X) /* second value of the same element */ | |||
| SXADDQ INCX, X, X /* step X to the next element */ | |||
| LD a2, 0 * SIZE(X) | |||
| fclr t3 | |||
| LD a3, 1 * SIZE(X) | |||
| SXADDQ INCX, X, X | |||
| LD a4, 0 * SIZE(X) | |||
| LD a5, 1 * SIZE(X) | |||
| SXADDQ INCX, X, X | |||
| lda I, -1(I) /* the last group is handled by the drain code at $L13 */ | |||
| ble I, $L13 | |||
| .align 4 | |||
| $L12: /* steady state: add staged values while loading the 4th element of this group and three of the next */ | |||
| ADD s0, t0, s0 /* add the value staged one step earlier */ | |||
| ldl $31, PREFETCHSIZE * SIZE(X) /* load into $31; presumably a prefetch hint */ | |||
| fmov a0, t0 /* stage a0 unchanged for the next ADD into s0 */ | |||
| lda I, -1(I) | |||
| ADD s1, t1, s1 | |||
| LD a6, 0 * SIZE(X) | |||
| fmov a1, t1 | |||
| unop | |||
| ADD s2, t2, s2 | |||
| LD a7, 1 * SIZE(X) | |||
| fmov a2, t2 | |||
| SXADDQ INCX, X, X | |||
| ADD s3, t3, s3 | |||
| LD a0, 0 * SIZE(X) /* a0 was already staged in t0, so it can be reloaded for the next group */ | |||
| fmov a3, t3 | |||
| unop | |||
| ADD s0, t0, s0 | |||
| LD a1, 1 * SIZE(X) | |||
| fmov a4, t0 | |||
| SXADDQ INCX, X, X | |||
| ADD s1, t1, s1 | |||
| LD a2, 0 * SIZE(X) | |||
| fmov a5, t1 | |||
| unop | |||
| ADD s2, t2, s2 | |||
| LD a3, 1 * SIZE(X) | |||
| fmov a6, t2 | |||
| SXADDQ INCX, X, X | |||
| ADD s3, t3, s3 | |||
| LD a4, 0 * SIZE(X) | |||
| fmov a7, t3 | |||
| unop | |||
| LD a5, 1 * SIZE(X) | |||
| unop | |||
| SXADDQ INCX, X, X | |||
| bne I, $L12 | |||
| .align 4 | |||
| $L13: /* drain: finish the last full group; only a6 and a7 remain to be loaded */ | |||
| ADD s0, t0, s0 | |||
| LD a6, 0 * SIZE(X) | |||
| fmov a0, t0 | |||
| ADD s1, t1, s1 | |||
| LD a7, 1 * SIZE(X) | |||
| fmov a1, t1 | |||
| SXADDQ INCX, X, X | |||
| ADD s2, t2, s2 | |||
| fmov a2, t2 | |||
| ADD s3, t3, s3 | |||
| fmov a3, t3 | |||
| ADD s0, t0, s0 | |||
| fmov a4, t0 /* this t0 is added later, at $L17 or $L999 */ | |||
| ADD s1, t1, s1 | |||
| fmov a5, t1 /* likewise this t1 */ | |||
| ADD s2, t2, s2 | |||
| fmov a6, t2 | |||
| ADD s3, t3, s3 | |||
| fmov a7, t3 | |||
| ADD s2, t2, s2 | |||
| ADD s3, t3, s3 | |||
| .align 4 | |||
| $L15: | |||
| ADD s0, s2, s0 /* fold s2 into s0 and s3 into s1 */ | |||
| and N, 3, I /* I = N mod 4 leftover elements */ | |||
| ADD s1, s3, s1 | |||
| ble I, $L999 | |||
| .align 4 | |||
| $L17: /* remainder loop: one element (two values) per pass */ | |||
| ADD s0, t0, s0 /* add the values staged by the previous pass (or by the drain code) */ | |||
| LD a0, 0 * SIZE(X) | |||
| fmov a0, t0 | |||
| lda I, -1(I) | |||
| ADD s1, t1, s1 | |||
| LD a1, 1 * SIZE(X) | |||
| fmov a1, t1 | |||
| SXADDQ INCX, X, X | |||
| bne I, $L17 | |||
| .align 4 | |||
| $L999: | |||
| ADD s0, t0, s0 /* add the last staged values */ | |||
| ADD s1, t1, s1 | |||
| ADD s0, s1, s0 /* merge both running totals into the single result in s0 */ | |||
| ret | |||
| EPILOGUE | |||
| @@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c | |||
| CASUMKERNEL = ../arm/zasum.c | |||
| ZASUMKERNEL = ../arm/zasum.c | |||
| SSUMKERNEL = ../arm/sum.c | |||
| DSUMKERNEL = ../arm/sum.c | |||
| CSUMKERNEL = ../arm/zsum.c | |||
| ZSUMKERNEL = ../arm/zsum.c | |||
| SAXPYKERNEL = ../arm/axpy.c | |||
| DAXPYKERNEL = ../arm/axpy.c | |||
| CAXPYKERNEL = ../arm/zaxpy.c | |||
| @@ -37,6 +37,9 @@ DASUMKERNEL = asum_vfp.S | |||
| CASUMKERNEL = asum_vfp.S | |||
| ZASUMKERNEL = asum_vfp.S | |||
| SSUMKERNEL = sum_vfp.S | |||
| DSUMKERNEL = sum_vfp.S | |||
| SAXPYKERNEL = axpy_vfp.S | |||
| DAXPYKERNEL = axpy_vfp.S | |||
| CAXPYKERNEL = axpy_vfp.S | |||
| @@ -0,0 +1,51 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * trivial copy of asum.c with the ABS() removed * | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| /* Plain (signed) sum of n elements of x taken every inc_x entries. | |||
|  * Returns 0.0 when n <= 0 or inc_x <= 0. */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT total = 0.0; | |||
| BLASLONG pos; | |||
| BLASLONG limit; | |||
| if (!(n > 0 && inc_x > 0)) { | |||
| return total; | |||
| } | |||
| /* one past the last index that will be visited */ | |||
| limit = n * inc_x; | |||
| for (pos = 0; pos < limit; pos += inc_x) { | |||
| total += x[pos]; | |||
| } | |||
| return total; | |||
| } | |||
| @@ -0,0 +1,425 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed * | |||
| **************************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define STACKSIZE 256 /* not referenced by the code in this kernel */ | |||
| #define N r0 /* element count */ | |||
| #define X r1 /* address of the next element to load */ | |||
| #define INC_X r2 /* stride; converted to bytes before the strided loops */ | |||
| #define I r12 /* loop counter */ | |||
| #define X_PRE 512 /* offset used by the pld instructions */ | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| #if !defined(COMPLEX) /* variants that read one value per element */ | |||
| #if defined(DOUBLE) | |||
| .macro KERNEL_F4 /* contiguous: add 4 consecutive doubles, alternating between accumulators d0 and d1 */ | |||
| pld [ X, #X_PRE ] | |||
| vldmia.f64 X!, { d4 - d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vldmia.f64 X!, { d6 - d7 } | |||
| vadd.f64 d1 , d1, d5 | |||
| vadd.f64 d0 , d0, d6 | |||
| vadd.f64 d1 , d1, d7 | |||
| .endm | |||
| .macro KERNEL_F1 /* contiguous: add one double to d0 */ | |||
| vldmia.f64 X!, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| .endm | |||
| .macro KERNEL_S4 /* strided: add 4 doubles to d0, moving X by INC_X after each */ | |||
| vldmia.f64 X, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| add X, X, INC_X | |||
| vldmia.f64 X, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| add X, X, INC_X | |||
| vldmia.f64 X, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| add X, X, INC_X | |||
| vldmia.f64 X, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| add X, X, INC_X | |||
| .endm | |||
| .macro KERNEL_S1 /* strided: add one double to d0 and move X by INC_X */ | |||
| vldmia.f64 X, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| add X, X, INC_X | |||
| .endm | |||
| #else | |||
| .macro KERNEL_F4 /* contiguous: add 4 consecutive singles, alternating between accumulators s0 and s1 */ | |||
| vldmia.f32 X!, { s4 - s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vldmia.f32 X!, { s6 - s7 } | |||
| vadd.f32 s1 , s1, s5 | |||
| vadd.f32 s0 , s0, s6 | |||
| vadd.f32 s1 , s1, s7 | |||
| .endm | |||
| .macro KERNEL_F1 /* contiguous: add one single to s0 */ | |||
| vldmia.f32 X!, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| .endm | |||
| .macro KERNEL_S4 /* strided: add 4 singles to s0, moving X by INC_X after each */ | |||
| vldmia.f32 X, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| add X, X, INC_X | |||
| vldmia.f32 X, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| add X, X, INC_X | |||
| vldmia.f32 X, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| add X, X, INC_X | |||
| vldmia.f32 X, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| add X, X, INC_X | |||
| .endm | |||
| .macro KERNEL_S1 /* strided: add one single to s0 and move X by INC_X */ | |||
| vldmia.f32 X, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| add X, X, INC_X | |||
| .endm | |||
| #endif | |||
| #else /* COMPLEX: every element is a pair of values and both are added */ | |||
| #if defined(DOUBLE) | |||
| .macro KERNEL_F4 /* contiguous: add 8 consecutive doubles (4 elements), alternating between d0 and d1 */ | |||
| pld [ X, #X_PRE ] | |||
| vldmia.f64 X!, { d4 - d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vldmia.f64 X!, { d6 - d7 } | |||
| vadd.f64 d1 , d1, d5 | |||
| vadd.f64 d0 , d0, d6 | |||
| vadd.f64 d1 , d1, d7 | |||
| pld [ X, #X_PRE ] | |||
| vldmia.f64 X!, { d4 - d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vldmia.f64 X!, { d6 - d7 } | |||
| vadd.f64 d1 , d1, d5 | |||
| vadd.f64 d0 , d0, d6 | |||
| vadd.f64 d1 , d1, d7 | |||
| .endm | |||
| .macro KERNEL_F1 /* contiguous: add the two doubles of one element to d0 */ | |||
| vldmia.f64 X!, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vldmia.f64 X!, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| .endm | |||
| .macro KERNEL_S4 /* strided: add 4 elements (two doubles each) to d0, moving X by INC_X after each */ | |||
| vldmia.f64 X, { d4 -d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vadd.f64 d0 , d0, d5 | |||
| add X, X, INC_X | |||
| vldmia.f64 X, { d4 -d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vadd.f64 d0 , d0, d5 | |||
| add X, X, INC_X | |||
| vldmia.f64 X, { d4 -d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vadd.f64 d0 , d0, d5 | |||
| add X, X, INC_X | |||
| vldmia.f64 X, { d4 -d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vadd.f64 d0 , d0, d5 | |||
| add X, X, INC_X | |||
| .endm | |||
| .macro KERNEL_S1 /* strided: add the two doubles of one element to d0 and move X by INC_X */ | |||
| vldmia.f64 X, { d4 -d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vadd.f64 d0 , d0, d5 | |||
| add X, X, INC_X | |||
| .endm | |||
| #else | |||
| .macro KERNEL_F4 /* contiguous: add 8 consecutive singles (4 elements), alternating between s0 and s1 */ | |||
| pld [ X, #X_PRE ] | |||
| vldmia.f32 X!, { s4 - s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vldmia.f32 X!, { s6 - s7 } | |||
| vadd.f32 s1 , s1, s5 | |||
| vadd.f32 s0 , s0, s6 | |||
| vadd.f32 s1 , s1, s7 | |||
| vldmia.f32 X!, { s4 - s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vldmia.f32 X!, { s6 - s7 } | |||
| vadd.f32 s1 , s1, s5 | |||
| vadd.f32 s0 , s0, s6 | |||
| vadd.f32 s1 , s1, s7 | |||
| .endm | |||
| .macro KERNEL_F1 /* contiguous: add the two singles of one element to s0 */ | |||
| vldmia.f32 X!, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vldmia.f32 X!, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| .endm | |||
| .macro KERNEL_S4 /* strided: add 4 elements (two singles each) to s0, moving X by INC_X after each */ | |||
| vldmia.f32 X, { s4 -s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vadd.f32 s0 , s0, s5 | |||
| add X, X, INC_X | |||
| vldmia.f32 X, { s4 -s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vadd.f32 s0 , s0, s5 | |||
| add X, X, INC_X | |||
| vldmia.f32 X, { s4 -s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vadd.f32 s0 , s0, s5 | |||
| add X, X, INC_X | |||
| vldmia.f32 X, { s4 -s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vadd.f32 s0 , s0, s5 | |||
| add X, X, INC_X | |||
| .endm | |||
| .macro KERNEL_S1 /* strided: add the two singles of one element to s0 and move X by INC_X */ | |||
| vldmia.f32 X, { s4 -s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vadd.f32 s0 , s0, s5 | |||
| add X, X, INC_X | |||
| .endm | |||
| #endif | |||
| #endif | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| PROLOGUE /* Sum kernel: adds N elements of X taken with stride INC_X; result in s0 (d0 for DOUBLE), returns 0 when N <= 0 or INC_X <= 0. */ | |||
| .align 5 | |||
| movs r12, #0 // clear floating point register | |||
| vmov s0, r12 | |||
| vmov s1, r12 | |||
| #if defined(DOUBLE) | |||
| vcvt.f64.f32 d0, s0 /* turn the zeroed singles into zeroed double accumulators */ | |||
| vcvt.f64.f32 d1, s1 | |||
| #endif | |||
| cmp N, #0 | |||
| ble asum_kernel_L999 /* nothing to sum */ | |||
| cmp INC_X, #0 | |||
| ble asum_kernel_L999 /* zero or negative stride: return 0 instead of stepping X backwards through memory */ | |||
| cmp INC_X, #1 | |||
| bne asum_kernel_S_BEGIN /* non-unit stride uses the strided loops */ | |||
| asum_kernel_F_BEGIN: /* unit stride */ | |||
| asrs I, N, #2 // I = N / 4 | |||
| ble asum_kernel_F1 | |||
| .align 5 | |||
| asum_kernel_F4: /* two KERNEL_F4 expansions per pass, with an early exit between them */ | |||
| #if !defined(DOUBLE) && !defined(COMPLEX) | |||
| pld [ X, #X_PRE ] | |||
| #endif | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| ble asum_kernel_F1 | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne asum_kernel_F4 | |||
| asum_kernel_F1: | |||
| ands I, N, #3 /* I = N mod 4 leftover elements */ | |||
| ble asum_kernel_L999 | |||
| asum_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne asum_kernel_F10 | |||
| b asum_kernel_L999 | |||
| asum_kernel_S_BEGIN: /* strided: first convert INC_X from elements to bytes */ | |||
| #if defined(COMPLEX) | |||
| #if defined(DOUBLE) | |||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||
| #else | |||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||
| #endif | |||
| #else | |||
| #if defined(DOUBLE) | |||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||
| #else | |||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||
| #endif | |||
| #endif | |||
| asrs I, N, #2 // I = N / 4 | |||
| ble asum_kernel_S1 | |||
| .align 5 | |||
| asum_kernel_S4: | |||
| KERNEL_S4 | |||
| subs I, I, #1 | |||
| bne asum_kernel_S4 | |||
| asum_kernel_S1: | |||
| ands I, N, #3 /* I = N mod 4 leftover elements */ | |||
| ble asum_kernel_L999 | |||
| asum_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne asum_kernel_S10 | |||
| asum_kernel_L999: /* merge the two accumulators into the result register */ | |||
| #if defined(DOUBLE) | |||
| vadd.f64 d0 , d0, d1 // set return value | |||
| #else | |||
| vadd.f32 s0 , s0, s1 // set return value | |||
| #endif | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vmov r0, s0 /* without __ARM_PCS_VFP the result is also copied to core registers */ | |||
| #else | |||
| vmov r0, r1, d0 | |||
| #endif | |||
| #endif | |||
| bx lr | |||
| EPILOGUE | |||
| @@ -0,0 +1,57 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * trivial copy of zasum.c with the ABS() removed * | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| /* Sum of the two values stored at x[i] and x[i+1]. The arguments and the whole | |||
|  * expansion are parenthesized so the macro stays correct inside any expression. */ | |||
| #define CSUM1(x,i) ((x)[(i)] + (x)[(i) + 1]) | |||
| /* Plain (signed) sum over n elements of x, where each element occupies two | |||
|  * consecutive FLOATs and consecutive elements are inc_x elements apart; both | |||
|  * values of every element are added. Returns 0.0 when n <= 0 or inc_x <= 0. */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT sumf = 0.0; | |||
| BLASLONG inc_x2; | |||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||
| /* stride and end index measured in FLOATs rather than in elements */ | |||
| inc_x2 = 2 * inc_x; | |||
| n *= inc_x2; | |||
| while(i < n) | |||
| { | |||
| sumf += CSUM1(x,i); | |||
| i += inc_x2; | |||
| } | |||
| return(sumf); | |||
| } | |||
| @@ -0,0 +1,164 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define N x0 /* vector length */ | |||
| #define X x1 /* X vector address */ | |||
| #define INC_X x2 /* X stride */ | |||
| #define I x5 /* loop variable */ | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| #define REG0 wzr /* zero source used to clear SUMF */ | |||
| #define SUMF s0 /* running total and result */ | |||
| #define TMPF s1 /* scratch: sum of the two singles of one element */ | |||
| #define TMPVF {v1.s}[0] /* not referenced by the macros in this kernel */ | |||
| #define SZ 4 /* not referenced by the macros in this kernel */ | |||
| /******************************************************************************/ | |||
| .macro KERNEL_F1 /* contiguous: add both singles of one element to SUMF */ | |||
| ld1 {v1.2s}, [X], #8 /* load two singles, then advance X by 8 bytes */ | |||
| ext v2.8b, v1.8b, v1.8b, #4 /* rotate by 4 bytes so the second single becomes s2 */ | |||
| fadd TMPF, TMPF, s2 /* TMPF = first single + second single */ | |||
| fadd SUMF, SUMF, TMPF | |||
| .endm | |||
| .macro KERNEL_F8 /* contiguous: accumulate 16 singles (8 elements) into the four lanes of v0 */ | |||
| ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X] | |||
| add X, X, #64 | |||
| PRFM PLDL1KEEP, [X, #1024] | |||
| fadd v1.4s, v1.4s, v2.4s | |||
| fadd v3.4s, v3.4s, v4.4s | |||
| fadd v0.4s, v0.4s, v1.4s | |||
| fadd v0.4s, v0.4s, v3.4s | |||
| .endm | |||
| .macro KERNEL_F8_FINALIZE /* fold the four lanes of v0 into the scalar SUMF */ | |||
| ext v1.16b, v0.16b, v0.16b, #8 /* bring the upper two lanes down */ | |||
| fadd v0.2s, v0.2s, v1.2s | |||
| faddp SUMF, v0.2s | |||
| .endm | |||
| .macro INIT_S /* convert the stride from elements to bytes (8 bytes per element) */ | |||
| lsl INC_X, INC_X, #3 | |||
| .endm | |||
| .macro KERNEL_S1 /* strided: same as KERNEL_F1 but X advances by INC_X bytes */ | |||
| ld1 {v1.2s}, [X], INC_X | |||
| ext v2.8b, v1.8b, v1.8b, #4 | |||
| fadd TMPF, TMPF, s2 | |||
| fadd SUMF, SUMF, TMPF | |||
| .endm | |||
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
| PROLOGUE /* Sum kernel for two-single elements: adds both values of N elements of X (stride INC_X) into SUMF; returns 0 when N <= 0 or INC_X <= 0. */ | |||
| fmov SUMF, REG0 /* SUMF = 0 */ | |||
| fmov s1, SUMF | |||
| cmp N, xzr | |||
| ble .Lcsum_kernel_L999 | |||
| cmp INC_X, xzr | |||
| ble .Lcsum_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne .Lcsum_kernel_S_BEGIN /* non-unit stride uses the strided loops */ | |||
| .Lcsum_kernel_F_BEGIN: /* unit stride */ | |||
| asr I, N, #3 /* I = N / 8 */ | |||
| cmp I, xzr | |||
| beq .Lcsum_kernel_F1 | |||
| .Lcsum_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne .Lcsum_kernel_F8 | |||
| KERNEL_F8_FINALIZE /* collapse the vector accumulator before the scalar tail */ | |||
| .Lcsum_kernel_F1: | |||
| ands I, N, #7 /* I = N mod 8 leftover elements */ | |||
| ble .Lcsum_kernel_L999 | |||
| .Lcsum_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne .Lcsum_kernel_F10 | |||
| .Lcsum_kernel_L999: | |||
| ret | |||
| .Lcsum_kernel_S_BEGIN: | |||
| INIT_S | |||
| asr I, N, #2 /* I = N / 4 */ | |||
| cmp I, xzr | |||
| ble .Lcsum_kernel_S1 | |||
| .Lcsum_kernel_S4: /* four elements per pass */ | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne .Lcsum_kernel_S4 | |||
| .Lcsum_kernel_S1: | |||
| ands I, N, #3 /* I = N mod 4 leftover elements */ | |||
| ble .Lcsum_kernel_L999 | |||
| .Lcsum_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne .Lcsum_kernel_S10 | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,186 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define N x0 /* vector length */ | |||
| #define X x1 /* X vector address */ | |||
| #define INC_X x2 /* X stride */ | |||
| #define I x5 /* loop variable */ | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| #if !defined(DOUBLE) | |||
| #define REG0 wzr /* zero source used to clear SUMF */ | |||
| #define SUMF s0 /* running total and result */ | |||
| #define TMPF s1 /* scalar view of the element just loaded */ | |||
| #define TMPVF {v1.s}[0] /* lane-0 load target used by KERNEL_S1; same register as TMPF */ | |||
| #define SZ 4 /* element size in bytes, used as the post-increment in KERNEL_F1 */ | |||
| #else | |||
| #define REG0 xzr /* zero source used to clear SUMF */ | |||
| #define SUMF d0 /* running total and result */ | |||
| #define TMPF d1 /* scalar view of the element just loaded */ | |||
| #define TMPVF {v1.d}[0] /* lane-0 load target used by KERNEL_S1; same register as TMPF */ | |||
| #define SZ 8 /* element size in bytes, used as the post-increment in KERNEL_F1 */ | |||
| #endif | |||
| /******************************************************************************/ | |||
| .macro KERNEL_F1 /* contiguous: add one element to SUMF, advancing X by SZ bytes */ | |||
| ldr TMPF, [X], #SZ | |||
| fadd SUMF, SUMF, TMPF | |||
| .endm | |||
| .macro KERNEL_F8 /* contiguous: accumulate 8 elements into the lanes of v0 */ | |||
| #if !defined(DOUBLE) | |||
| ld1 {v1.4s, v2.4s}, [X], #32 // Load X0..X7 as two vectors of four singles | |||
| fadd v1.4s, v1.4s, v2.4s // lane i = X[i] + X[i+4] | |||
| fadd v0.4s, v0.4s, v1.4s // accumulate the four lane sums into v0 | |||
| PRFM PLDL1KEEP, [X, #1024] | |||
| #else // DOUBLE | |||
| ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X] | |||
| add X, X, #64 | |||
| PRFM PLDL1KEEP, [X, #1024] | |||
| fadd v2.2d, v2.2d, v3.2d | |||
| fadd v4.2d, v4.2d, v5.2d | |||
| fadd v0.2d, v0.2d, v2.2d | |||
| fadd v0.2d, v0.2d, v4.2d | |||
| #endif | |||
| .endm | |||
| .macro KERNEL_F8_FINALIZE /* fold the lanes of v0 into the scalar SUMF */ | |||
| #if !defined(DOUBLE) | |||
| ext v1.16b, v0.16b, v0.16b, #8 /* bring the upper two lanes down */ | |||
| fadd v0.2s, v0.2s, v1.2s | |||
| faddp SUMF, v0.2s | |||
| #else | |||
| faddp SUMF, v0.2d | |||
| #endif | |||
| .endm | |||
| .macro INIT_S /* convert the stride from elements to bytes */ | |||
| #if !defined(DOUBLE) | |||
| lsl INC_X, INC_X, #2 | |||
| #else | |||
| lsl INC_X, INC_X, #3 | |||
| #endif | |||
| .endm | |||
| .macro KERNEL_S1 /* strided: load one element into lane 0 of v1, advance X by INC_X bytes, add it to SUMF */ | |||
| ld1 TMPVF, [X], INC_X | |||
| fadd SUMF, SUMF, TMPF | |||
| .endm | |||
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
| PROLOGUE /* Sum kernel: adds N elements of X taken with stride INC_X into SUMF; returns 0 when N <= 0 or INC_X <= 0. */ | |||
| fmov SUMF, REG0 /* SUMF = 0 */ | |||
| #if !defined(DOUBLE) | |||
| fmov s1, SUMF | |||
| #else | |||
| fmov d1, SUMF | |||
| #endif | |||
| cmp N, xzr | |||
| ble .Lsum_kernel_L999 | |||
| cmp INC_X, xzr | |||
| ble .Lsum_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne .Lsum_kernel_S_BEGIN /* non-unit stride uses the strided loops */ | |||
| .Lsum_kernel_F_BEGIN: /* unit stride */ | |||
| asr I, N, #3 /* I = N / 8 */ | |||
| cmp I, xzr | |||
| beq .Lsum_kernel_F1 | |||
| .Lsum_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne .Lsum_kernel_F8 | |||
| KERNEL_F8_FINALIZE /* collapse the vector accumulator before the scalar tail */ | |||
| .Lsum_kernel_F1: | |||
| ands I, N, #7 /* I = N mod 8 leftover elements */ | |||
| ble .Lsum_kernel_L999 | |||
| .Lsum_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne .Lsum_kernel_F10 | |||
| .Lsum_kernel_L999: | |||
| ret | |||
| .Lsum_kernel_S_BEGIN: | |||
| INIT_S | |||
| asr I, N, #2 /* I = N / 4 */ | |||
| cmp I, xzr | |||
| ble .Lsum_kernel_S1 | |||
| .Lsum_kernel_S4: /* four elements per pass */ | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne .Lsum_kernel_S4 | |||
| .Lsum_kernel_S1: | |||
| ands I, N, #3 /* I = N mod 4 leftover elements */ | |||
| ble .Lsum_kernel_L999 | |||
| .Lsum_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne .Lsum_kernel_S10 | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,158 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define N x0 /* vector length (number of elements) */ | |||
| #define X x1 /* X vector address; advanced by the load macros */ | |||
| #define INC_X x2 /* X stride in elements; INIT_S rescales it to bytes */ | |||
| #define I x5 /* loop counter */ | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| #define REG0 xzr /* zero register used to clear SUMF */ | |||
| #define SUMF d0 /* scalar running sum */ | |||
| #define TMPF d1 /* scalar temporary holding one element's contribution */ | |||
| #define TMPVF {v1.d}[0] /* lane-0 view of v1; not referenced by the macros below */ | |||
| #define SZ 8 /* bytes per double; not referenced by the macros below */ | |||
| /******************************************************************************/ | |||
| .macro KERNEL_F1 /* unit-stride tail step: consume one 16-byte element (two doubles) */ | |||
| ld1 {v1.2d}, [X], #16 /* load both doubles of the element, post-increment X by 16 bytes */ | |||
| faddp TMPF, v1.2d /* TMPF = first double + second double */ | |||
| fadd SUMF, SUMF, TMPF /* add that to the scalar running sum */ | |||
| .endm | |||
| .macro KERNEL_F4 /* unit-stride main step: fold four 16-byte elements into the vector accumulator v0 */ | |||
| ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 /* load 8 doubles into v1..v4, post-increment X by 64 bytes */ | |||
| fadd v1.2d, v1.2d, v2.2d /* lane-wise v1 += v2 */ | |||
| fadd v3.2d, v3.2d, v4.2d /* lane-wise v3 += v4 */ | |||
| fadd v0.2d, v0.2d, v1.2d /* accumulate first pair into v0 */ | |||
| fadd v0.2d, v0.2d, v3.2d /* accumulate second pair into v0 */ | |||
| PRFM PLDL1KEEP, [X, #1024] /* prefetch 1024 bytes ahead of the updated X */ | |||
| .endm | |||
| .macro KERNEL_F4_FINALIZE /* collapse the two lanes of v0 into the scalar SUMF */ | |||
| faddp SUMF, v0.2d /* SUMF = lane 0 + lane 1 */ | |||
| .endm | |||
| .macro INIT_S /* strided-path setup: convert INC_X from elements to bytes */ | |||
| lsl INC_X, INC_X, #4 /* INC_X *= 16, the size of the two doubles read per element */ | |||
| .endm | |||
| .macro KERNEL_S1 /* strided step: consume one 16-byte element (two doubles) */ | |||
| ld1 {v1.2d}, [X], INC_X /* load both doubles, post-increment X by INC_X (bytes after INIT_S) */ | |||
| faddp TMPF, v1.2d /* TMPF = first double + second double */ | |||
| fadd SUMF, SUMF, TMPF /* add that to the scalar running sum */ | |||
| .endm | |||
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
| PROLOGUE /* zsum kernel body: adds both doubles of each of N elements of X, stride INC_X; the total is in SUMF at every ret */ | |||
| fmov SUMF, REG0 /* clear the running sum */ | |||
| cmp N, xzr | |||
| ble .Lzsum_kernel_L999 /* N <= 0: return the zero sum */ | |||
| cmp INC_X, xzr | |||
| ble .Lzsum_kernel_L999 /* INC_X <= 0: return the zero sum */ | |||
| cmp INC_X, #1 | |||
| bne .Lzsum_kernel_S_BEGIN /* any stride other than 1 takes the strided path */ | |||
| .Lzsum_kernel_F_BEGIN: /* unit-stride path */ | |||
| asr I, N, #2 /* I = number of full 4-element batches */ | |||
| cmp I, xzr | |||
| beq .Lzsum_kernel_F1 /* no full batch: go straight to the tail */ | |||
| .Lzsum_kernel_F4: | |||
| KERNEL_F4 /* accumulate one batch into v0 */ | |||
| subs I, I, #1 | |||
| bne .Lzsum_kernel_F4 | |||
| KERNEL_F4_FINALIZE /* reduce v0 to SUMF before the tail */ | |||
| .Lzsum_kernel_F1: | |||
| ands I, N, #3 /* I = leftover elements (N mod 4) */ | |||
| ble .Lzsum_kernel_L999 /* nothing left: done */ | |||
| .Lzsum_kernel_F10: | |||
| KERNEL_F1 /* one leftover element per iteration */ | |||
| subs I, I, #1 | |||
| bne .Lzsum_kernel_F10 | |||
| .Lzsum_kernel_L999: /* common exit */ | |||
| ret | |||
| .Lzsum_kernel_S_BEGIN: /* strided path */ | |||
| INIT_S /* INC_X becomes a byte stride */ | |||
| asr I, N, #2 /* I = number of groups of 4 elements */ | |||
| cmp I, xzr | |||
| ble .Lzsum_kernel_S1 /* fewer than 4 elements: tail only */ | |||
| .Lzsum_kernel_S4: /* 4 elements per iteration, each added straight into SUMF */ | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne .Lzsum_kernel_S4 | |||
| .Lzsum_kernel_S1: | |||
| ands I, N, #3 /* I = leftover elements (N mod 4) */ | |||
| ble .Lzsum_kernel_L999 /* nothing left: done */ | |||
| .Lzsum_kernel_S10: | |||
| KERNEL_S1 /* one leftover element per iteration */ | |||
| subs I, I, #1 | |||
| bne .Lzsum_kernel_S10 | |||
| ret | |||
| EPILOGUE | |||
| @@ -60,6 +60,10 @@ CASUMKERNEL = asum.S | |||
| ZASUMKERNEL = asum.S | |||
| XASUMKERNEL = asum.S | |||
| CSUMKERNEL = sum.S | |||
| ZSUMKERNEL = sum.S | |||
| XSUMKERNEL = sum.S | |||
| CNRM2KERNEL = nrm2.S | |||
| ZNRM2KERNEL = nrm2.S | |||
| XNRM2KERNEL = nrm2.S | |||
| @@ -0,0 +1,358 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2019, The OpenBLAS project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #ifdef XDOUBLE | |||
| #define PREFETCH_SIZE ( 8 * 16 + 4) /* prefetch distance in elements; multiplied by SIZE when PRE1 is formed */ | |||
| #elif defined(DOUBLE) | |||
| #define PREFETCH_SIZE (16 * 16 + 8) | |||
| #else | |||
| #define PREFETCH_SIZE (32 * 16 + 16) | |||
| #endif | |||
| #ifndef COMPLEX | |||
| #define COMPADD 0 /* real build: no extra shift */ | |||
| #define STRIDE INCX /* every load steps by the full stride */ | |||
| #else | |||
| #define COMPADD 1 /* complex build: element counts are shifted by one bit less, the byte stride by one bit more */ | |||
| #define STRIDE SIZE /* first load of each pair steps by one scalar; the second steps by INCX */ | |||
| #endif | |||
| #define PRE1 r2 /* prefetch pointer */ | |||
| #define I r17 /* main-loop iteration count */ | |||
| #define J r18 /* low bits of N left over after the main loop */ | |||
| #define INCX16 r21 /* post-increment applied to PRE1 by each lfetch */ | |||
| #define PR r30 /* saved predicate registers */ | |||
| #define ARLC r31 /* saved ar.lc */ | |||
| #define N r32 /* element count */ | |||
| #define X r33 /* data pointer */ | |||
| #define INCX r34 /* stride: elements on entry, bytes after the shl in the kernel */ | |||
| PROLOGUE /* ia64 sum kernel: adds N elements of X (stride INCX) using four partial sums f8..f11, combined into f8 at .L998 */ | |||
| .prologue | |||
| PROFCODE | |||
| { .mfi | |||
| adds PRE1 = PREFETCH_SIZE * SIZE, X /* prefetch pointer starts PREFETCH_SIZE scalars ahead of X */ | |||
| mov f8 = f0 /* clear partial sum f8 (f0 is the constant-zero FP register) */ | |||
| .save ar.lc, ARLC | |||
| mov ARLC = ar.lc /* keep the caller loop counter; restored at .L998 */ | |||
| } | |||
| ;; | |||
| .body | |||
| #ifdef F_INTERFACE | |||
| { .mmi | |||
| LDINT N = [N] /* Fortran interface: N and INCX arrive as pointers */ | |||
| LDINT INCX = [INCX] | |||
| nop.i 0 | |||
| } | |||
| ;; | |||
| #ifndef USE64BITINT | |||
| { .mii | |||
| nop.m 0 | |||
| sxt4 N = N /* sign-extend the 32-bit values */ | |||
| sxt4 INCX = INCX | |||
| } | |||
| ;; | |||
| #endif | |||
| #endif | |||
| { .mmi | |||
| cmp.lt p0, p6 = r0, INCX /* p6 = INCX <= 0 */ | |||
| cmp.lt p0, p7 = r0, N /* p7 = N <= 0 */ | |||
| shr I = N, (4 - COMPADD) /* I = number of full main-loop batches (16 loads each) */ | |||
| } | |||
| { .mbb | |||
| and J = ((1 << (4 - COMPADD)) - 1), N /* J = elements left over after the full batches */ | |||
| (p6) br.ret.sptk.many b0 /* non-positive stride: return with f8 = 0 */ | |||
| (p7) br.ret.sptk.many b0 /* non-positive count: return with f8 = 0 */ | |||
| } | |||
| ;; | |||
| { .mfi | |||
| adds I = -1, I /* loop counter is batches minus one */ | |||
| mov f10 = f0 /* clear partial sum f10 */ | |||
| mov PR = pr /* save predicates; restored at .L998 */ | |||
| } | |||
| { .mfi | |||
| cmp.eq p9, p0 = r0, J /* p9 = no leftover elements */ | |||
| mov f9 = f0 /* clear partial sum f9 */ | |||
| tbit.z p0, p12 = N, 3 - COMPADD /* p12 = leftover includes a group of 8 loads */ | |||
| } | |||
| ;; | |||
| { .mmi | |||
| cmp.eq p16, p0 = r0, r0 /* p16 = true: first pipeline stage enabled */ | |||
| cmp.ne p17, p0 = r0, r0 /* p17 = false */ | |||
| mov ar.ec= 3 /* epilog count for the br.ctop loop */ | |||
| } | |||
| { .mfi | |||
| cmp.ne p18, p0 = r0, r0 /* p18 = false */ | |||
| mov f11 = f0 /* clear partial sum f11 */ | |||
| shl INCX = INCX, BASE_SHIFT + COMPADD /* stride in bytes per element */ | |||
| } | |||
| ;; | |||
| { .mmi | |||
| #ifdef XDOUBLE | |||
| shladd INCX16 = INCX, (3 - COMPADD), r0 /* prefetch step; lfetch is issued twice per iteration in this build */ | |||
| #else | |||
| shladd INCX16 = INCX, (4 - COMPADD), r0 /* prefetch step: bytes covered by one loop iteration */ | |||
| #endif | |||
| cmp.ne p19, p0 = r0, r0 /* p19 = false */ | |||
| mov ar.lc = I | |||
| } | |||
| { .mmb | |||
| cmp.gt p8 ,p0 = r0, I /* p8 = no full batch at all */ | |||
| #ifdef COMPLEX | |||
| adds INCX = - SIZE, INCX /* STRIDE (SIZE) plus this reduced INCX equals one full element step */ | |||
| #else | |||
| nop.m 0 | |||
| #endif | |||
| (p8) br.cond.dpnt .L55 /* skip the main loop */ | |||
| } | |||
| ;; | |||
| .align 32 | |||
| .L52: /* main loop: 16 loads per iteration under p16; adds run under later stage predicates p18/p19 */ | |||
| { .mmf | |||
| (p16) lfetch.nt1 [PRE1], INCX16 | |||
| (p16) LDFD f32 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p19) FADD f8 = f8, f71 /* p19 adds take the values loaded as f68..f77 three rotations earlier */ | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f35 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p19) FADD f9 = f9, f74 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f38 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p19) FADD f10 = f10, f77 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f41 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p19) FADD f11 = f11, f80 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f44 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p18) FADD f8 = f8, f34 /* p18 adds take the values loaded as f32..f65 two rotations earlier */ | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f47 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p18) FADD f9 = f9, f37 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f50 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p18) FADD f10 = f10, f40 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f53 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p18) FADD f11 = f11, f43 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| #ifdef XDOUBLE | |||
| (p16) lfetch.nt1 [PRE1], INCX16 | |||
| #endif | |||
| (p16) LDFD f56 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p18) FADD f8 = f8, f46 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f59 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p18) FADD f9 = f9, f49 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f62 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p18) FADD f10 = f10, f52 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f65 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p18) FADD f11 = f11, f55 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f68 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p18) FADD f8 = f8, f58 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f71 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p18) FADD f9 = f9, f61 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f74 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p18) FADD f10 = f10, f64 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f77 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p18) FADD f11 = f11, f67 | |||
| br.ctop.sptk.few .L52 /* rotate registers and predicates, loop while ar.lc / ar.ec allow */ | |||
| } | |||
| ;; | |||
| FADD f8 = f8, f71 /* NOTE(review): these four adds mirror the p19 adds and presumably drain the last batch, since ar.ec is 3 - confirm */ | |||
| FADD f9 = f9, f74 | |||
| FADD f10 = f10, f77 | |||
| FADD f11 = f11, f80 | |||
| .align 32 | |||
| ;; | |||
| .L55: /* leftover elements: p12 guards 8 loads, p13 guards 4, p14 guards 2, p15 guards 1 (real build only) */ | |||
| (p12) LDFD f32 = [X], STRIDE | |||
| (p9) br.cond.dptk .L998 /* no leftover: go combine the partial sums */ | |||
| ;; | |||
| (p12) LDFD f33 = [X], INCX | |||
| ;; | |||
| (p12) LDFD f34 = [X], STRIDE | |||
| ;; | |||
| (p12) LDFD f35 = [X], INCX | |||
| tbit.z p0, p13 = N, (2 - COMPADD) /* p13 = leftover includes a group of 4 loads */ | |||
| ;; | |||
| (p12) LDFD f36 = [X], STRIDE | |||
| tbit.z p0, p14 = N, (1 - COMPADD) /* p14 = leftover includes a group of 2 loads */ | |||
| ;; | |||
| (p12) LDFD f37 = [X], INCX | |||
| #ifndef COMPLEX | |||
| tbit.z p0, p15 = N, 0 /* p15 = one final single element */ | |||
| #endif | |||
| ;; | |||
| (p12) LDFD f38 = [X], STRIDE | |||
| ;; | |||
| (p12) LDFD f39 = [X], INCX | |||
| ;; | |||
| (p13) LDFD f40 = [X], STRIDE | |||
| ;; | |||
| (p13) LDFD f41 = [X], INCX | |||
| ;; | |||
| (p13) LDFD f42 = [X], STRIDE | |||
| (p12) FADD f8 = f8, f32 /* leftover values are spread round-robin over f8..f11 */ | |||
| ;; | |||
| (p13) LDFD f43 = [X], INCX | |||
| (p12) FADD f9 = f9, f33 | |||
| ;; | |||
| (p14) LDFD f44 = [X], STRIDE | |||
| (p12) FADD f10 = f10, f34 | |||
| ;; | |||
| (p14) LDFD f45 = [X], INCX | |||
| (p12) FADD f11 = f11, f35 | |||
| ;; | |||
| #ifndef COMPLEX | |||
| (p15) LDFD f46 = [X] | |||
| #endif | |||
| (p12) FADD f8 = f8, f36 | |||
| ;; | |||
| (p12) FADD f9 = f9, f37 | |||
| (p12) FADD f10 = f10, f38 | |||
| (p12) FADD f11 = f11, f39 | |||
| ;; | |||
| (p13) FADD f8 = f8, f40 | |||
| (p13) FADD f9 = f9, f41 | |||
| #ifndef COMPLEX | |||
| #endif | |||
| (p13) FADD f10 = f10, f42 | |||
| ;; | |||
| (p13) FADD f11 = f11, f43 | |||
| (p14) FADD f8 = f8, f44 | |||
| (p14) FADD f9 = f9, f45 | |||
| #ifndef COMPLEX | |||
| (p15) FADD f10 = f10, f46 | |||
| #endif | |||
| ;; | |||
| .align 32 | |||
| .L998: /* combine the four partial sums into f8 and return */ | |||
| { .mfi | |||
| FADD f8 = f8, f9 | |||
| mov ar.lc = ARLC /* restore the caller loop counter */ | |||
| } | |||
| { .mmf | |||
| FADD f10 = f10, f11 | |||
| } | |||
| ;; | |||
| { .mii | |||
| mov pr = PR, -65474 /* restore saved predicates under this mask */ | |||
| } | |||
| ;; | |||
| { .mfb | |||
| FADD f8 = f8, f10 /* f8 = (f8 + f9) + (f10 + f11) */ | |||
| br.ret.sptk.many b0 | |||
| } | |||
| EPILOGUE | |||
| @@ -30,6 +30,11 @@ IDMAXKERNEL = ../mips/imax.c | |||
| ISMINKERNEL = ../mips/imin.c | |||
| IDMINKERNEL = ../mips/imin.c | |||
| SSUMKERNEL = ../mips/sum.c | |||
| DSUMKERNEL = ../mips/sum.c | |||
| CSUMKERNEL = ../mips/zsum.c | |||
| ZSUMKERNEL = ../mips/zsum.c | |||
| ifdef HAVE_MSA | |||
| SASUMKERNEL = ../mips/sasum_msa.c | |||
| DASUMKERNEL = ../mips/dasum_msa.c | |||
| @@ -0,0 +1,47 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| /* | |||
|  * Plain (signed) sum of n elements of x, visiting every inc_x-th entry. | |||
|  * Returns 0 when n or inc_x is not positive. | |||
|  */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
|     FLOAT total = 0.0; | |||
|     BLASLONG limit; | |||
|     BLASLONG pos; | |||
|     if (!(n > 0 && inc_x > 0)) | |||
|         return total; | |||
|     /* One past the last index touched: n strided steps of inc_x. */ | |||
|     limit = n * inc_x; | |||
|     for (pos = 0; pos < limit; pos += inc_x) | |||
|         total += x[pos]; | |||
|     return total; | |||
| } | |||
| @@ -0,0 +1,52 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| /* | |||
|  * Sum of the two scalars stored at x[i] and x[i+1] (the real and imaginary | |||
|  * part of one interleaved complex entry). Arguments and the whole expansion | |||
|  * are parenthesized so the macro stays correct inside larger expressions. | |||
|  */ | |||
| #define CSUM1(x,i) ((x)[(i)] + (x)[(i) + 1]) | |||
| /* | |||
|  * Sum of real and imaginary parts over n complex entries of x, taken with a | |||
|  * stride of inc_x complex entries (2 * inc_x scalars). | |||
|  * Returns 0 when n or inc_x is not positive. | |||
|  */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
|     BLASLONG i = 0; | |||
|     FLOAT sumf = 0.0; | |||
|     BLASLONG inc_x2; | |||
|     if (n <= 0 || inc_x <= 0) return(sumf); | |||
|     /* Each complex entry occupies two scalars, so step by 2 * inc_x. */ | |||
|     inc_x2 = 2 * inc_x; | |||
|     n *= inc_x2; | |||
|     while (i < n) | |||
|     { | |||
|         sumf += CSUM1(x, i); | |||
|         i += inc_x2; | |||
|     } | |||
|     return(sumf); | |||
| } | |||
| @@ -0,0 +1,332 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define N $4 /* element count */ | |||
| #define X $5 /* data pointer; advanced as elements are consumed */ | |||
| #define INCX $6 /* stride: elements on entry, bytes after the dsll by BASE_SHIFT */ | |||
| #define I $2 /* loop counter */ | |||
| #define TEMP $3 /* scratch; holds SIZE for the unit-stride test */ | |||
| #define a1 $f2 /* a1..a8: values loaded from X */ | |||
| #define a2 $f3 | |||
| #define a3 $f4 | |||
| #define a4 $f5 | |||
| #define a5 $f6 | |||
| #define a6 $f7 | |||
| #define a7 $f8 | |||
| #define a8 $f9 | |||
| #define t1 $f10 /* t1..t4: staged copies of loaded values awaiting their ADD */ | |||
| #define t2 $f11 | |||
| #define t3 $f12 | |||
| #define t4 $f13 | |||
| #define s1 $f0 /* first partial sum; receives s1 + s2 at .L999 */ | |||
| #define s2 $f1 /* second partial sum */ | |||
| PROLOGUE /* mips64 sum kernel: adds N elements of X (stride INCX) into two partial sums s1/s2, combined into s1 at .L999 */ | |||
| #ifdef F_INTERFACE | |||
| LDINT N, 0(N) /* Fortran interface: N and INCX arrive as pointers */ | |||
| LDINT INCX, 0(INCX) | |||
| #endif | |||
| MTC $0, s1 /* clear both partial sums */ | |||
| MTC $0, s2 | |||
| dsll INCX, INCX, BASE_SHIFT /* stride in bytes */ | |||
| blez N, .L999 /* N <= 0: return zero */ | |||
| li TEMP, SIZE | |||
| bne INCX, TEMP, .L20 /* stride differs from one element: strided path */ | |||
| dsra I, N, 3 /* I = N / 8; follows the branch directly and .L20 relies on it (branch delay slot) */ | |||
| blez I, .L15 /* fewer than 8 elements: tail only */ | |||
| NOP | |||
| LD a1, 0 * SIZE(X) /* preload the first 8 elements */ | |||
| LD a2, 1 * SIZE(X) | |||
| LD a3, 2 * SIZE(X) | |||
| LD a4, 3 * SIZE(X) | |||
| LD a5, 4 * SIZE(X) | |||
| MOV t1, a1 | |||
| LD a6, 5 * SIZE(X) | |||
| MOV t2, a2 | |||
| LD a7, 6 * SIZE(X) | |||
| MOV t3, a3 | |||
| MOV t4, a4 | |||
| daddiu I, I, -1 | |||
| blez I, .L13 /* only one batch: go straight to the drain */ | |||
| LD a8, 7 * SIZE(X) | |||
| .align 3 | |||
| .L12: /* unit-stride main loop: add the staged batch while loading the next 8 elements */ | |||
| ADD s1, s1, t1 | |||
| LD a1, 8 * SIZE(X) | |||
| MOV t1, a5 | |||
| daddiu I, I, -1 | |||
| ADD s2, s2, t2 | |||
| LD a2, 9 * SIZE(X) | |||
| MOV t2, a6 | |||
| NOP | |||
| ADD s1, s1, t3 | |||
| LD a3, 10 * SIZE(X) | |||
| MOV t3, a7 | |||
| NOP | |||
| ADD s2, s2, t4 | |||
| LD a4, 11 * SIZE(X) | |||
| MOV t4, a8 | |||
| daddiu X, X, 8 * SIZE /* X now points at the batch being loaded */ | |||
| ADD s1, s1, t1 | |||
| LD a5, 4 * SIZE(X) | |||
| MOV t1, a1 | |||
| NOP | |||
| ADD s2, s2, t2 | |||
| LD a6, 5 * SIZE(X) | |||
| MOV t2, a2 | |||
| NOP | |||
| ADD s1, s1, t3 | |||
| LD a7, 6 * SIZE(X) | |||
| MOV t3, a3 | |||
| NOP | |||
| ADD s2, s2, t4 | |||
| LD a8, 7 * SIZE(X) | |||
| bgtz I, .L12 | |||
| MOV t4, a4 | |||
| .align 3 | |||
| .L13: /* drain: add the last fully loaded batch */ | |||
| ADD s1, s1, t1 | |||
| daddiu X, X, 8 * SIZE | |||
| MOV t1, a5 | |||
| NOP | |||
| ADD s2, s2, t2 | |||
| MOV t2, a6 | |||
| ADD s1, s1, t3 | |||
| MOV t3, a7 | |||
| ADD s2, s2, t4 | |||
| MOV t4, a8 | |||
| ADD s1, s1, t1 | |||
| ADD s2, s2, t2 | |||
| ADD s1, s1, t3 | |||
| ADD s2, s2, t4 | |||
| .align 3 | |||
| .L15: /* unit-stride tail */ | |||
| andi I, N, 7 /* I = leftover elements (N mod 8) */ | |||
| blez I, .L999 | |||
| NOP | |||
| .align 3 | |||
| .L16: /* one leftover element per iteration */ | |||
| LD a1, 0 * SIZE(X) | |||
| daddiu I, I, -1 | |||
| MOV t1, a1 | |||
| ADD s1, s1, t1 | |||
| bgtz I, .L16 | |||
| daddiu X, X, SIZE | |||
| j .L999 | |||
| NOP | |||
| .align 3 | |||
| .L20: /* strided path; I already holds N / 8 */ | |||
| blez I, .L25 /* fewer than 8 elements: tail only */ | |||
| NOP | |||
| LD a1, 0 * SIZE(X) /* preload the first 8 elements, stepping X by INCX bytes each time */ | |||
| daddu X, X, INCX | |||
| LD a2, 0 * SIZE(X) | |||
| daddu X, X, INCX | |||
| LD a3, 0 * SIZE(X) | |||
| daddu X, X, INCX | |||
| LD a4, 0 * SIZE(X) | |||
| daddu X, X, INCX | |||
| LD a5, 0 * SIZE(X) | |||
| daddu X, X, INCX | |||
| LD a6, 0 * SIZE(X) | |||
| daddu X, X, INCX | |||
| MOV t1, a1 | |||
| LD a7, 0 * SIZE(X) | |||
| MOV t2, a2 | |||
| daddu X, X, INCX | |||
| MOV t3, a3 | |||
| LD a8, 0 * SIZE(X) | |||
| MOV t4, a4 | |||
| daddiu I, I, -1 | |||
| blez I, .L24 /* only one batch: go straight to the drain */ | |||
| daddu X, X, INCX | |||
| .align 3 | |||
| .L23: /* strided main loop: add the staged batch while loading the next 8 elements */ | |||
| ADD s1, s1, t1 | |||
| LD a1, 0 * SIZE(X) | |||
| MOV t1, a5 | |||
| daddu X, X, INCX | |||
| ADD s2, s2, t2 | |||
| LD a2, 0 * SIZE(X) | |||
| MOV t2, a6 | |||
| daddu X, X, INCX | |||
| ADD s1, s1, t3 | |||
| LD a3, 0 * SIZE(X) | |||
| MOV t3, a7 | |||
| daddu X, X, INCX | |||
| ADD s2, s2, t4 | |||
| LD a4, 0 * SIZE(X) | |||
| MOV t4, a8 | |||
| daddu X, X, INCX | |||
| ADD s1, s1, t1 | |||
| LD a5, 0 * SIZE(X) | |||
| MOV t1, a1 | |||
| daddu X, X, INCX | |||
| ADD s2, s2, t2 | |||
| LD a6, 0 * SIZE(X) | |||
| MOV t2, a2 | |||
| daddu X, X, INCX | |||
| ADD s1, s1, t3 | |||
| LD a7, 0 * SIZE(X) | |||
| MOV t3, a3 | |||
| daddu X, X, INCX | |||
| ADD s2, s2, t4 | |||
| LD a8, 0 * SIZE(X) | |||
| MOV t4, a4 | |||
| daddiu I, I, -1 | |||
| bgtz I, .L23 | |||
| daddu X, X, INCX | |||
| .align 3 | |||
| .L24: /* drain: add the last fully loaded batch */ | |||
| ADD s1, s1, t1 | |||
| MOV t1, a5 | |||
| ADD s2, s2, t2 | |||
| MOV t2, a6 | |||
| ADD s1, s1, t3 | |||
| MOV t3, a7 | |||
| ADD s2, s2, t4 | |||
| MOV t4, a8 | |||
| ADD s1, s1, t1 | |||
| ADD s2, s2, t2 | |||
| ADD s1, s1, t3 | |||
| ADD s2, s2, t4 | |||
| .align 3 | |||
| .L25: /* strided tail */ | |||
| andi I, N, 7 /* I = leftover elements (N mod 8) */ | |||
| blez I, .L999 | |||
| NOP | |||
| .align 3 | |||
| .L26: /* one leftover element per iteration */ | |||
| LD a1, 0 * SIZE(X) | |||
| daddiu I, I, -1 | |||
| MOV t1, a1 | |||
| daddu X, X, INCX | |||
| bgtz I, .L26 | |||
| ADD s1, s1, t1 | |||
| .align 3 | |||
| .L999: /* common exit */ | |||
| j $31 | |||
| ADD s1, s1, s2 /* result = s1 + s2 */ | |||
| EPILOGUE | |||
| @@ -0,0 +1,204 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define N $4 /* element count */ | |||
| #define X $5 /* data pointer; advanced by INCX per element */ | |||
| #define INCX $6 /* stride: elements on entry, bytes after the dsll by ZBASE_SHIFT */ | |||
| #define I $2 /* loop counter */ | |||
| #define TEMP $3 /* not referenced by the kernel below */ | |||
| #define a1 $f2 /* a1..a8: values loaded from X, as (offset 0, offset 1) pairs */ | |||
| #define a2 $f3 | |||
| #define a3 $f4 | |||
| #define a4 $f5 | |||
| #define a5 $f6 | |||
| #define a6 $f7 | |||
| #define a7 $f8 | |||
| #define a8 $f9 | |||
| #define t1 $f10 /* t1..t4: staged copies of loaded values awaiting their ADD */ | |||
| #define t2 $f11 | |||
| #define t3 $f12 | |||
| #define t4 $f13 | |||
| #define s1 $f0 /* sum of the offset-0 scalars; receives s1 + s2 at .L999 */ | |||
| #define s2 $f1 /* sum of the offset-1 scalars */ | |||
| PROLOGUE /* mips64 zsum kernel: for N two-scalar elements of X (stride INCX) adds offset-0 scalars into s1 and offset-1 scalars into s2; s1 + s2 at .L999 */ | |||
| #ifdef F_INTERFACE | |||
| LDINT N, 0(N) /* Fortran interface: N and INCX arrive as pointers */ | |||
| LDINT INCX, 0(INCX) | |||
| #endif | |||
| MTC $0, s1 /* clear both partial sums */ | |||
| MTC $0, s2 | |||
| dsll INCX, INCX, ZBASE_SHIFT /* stride in bytes */ | |||
| blez N, .L999 /* N <= 0: return zero */ | |||
| dsra I, N, 2 /* I = N / 4 */ | |||
| blez I, .L25 /* fewer than 4 elements: tail only */ | |||
| NOP | |||
| LD a1, 0 * SIZE(X) /* preload the first 4 elements (8 scalars) */ | |||
| LD a2, 1 * SIZE(X) | |||
| daddu X, X, INCX | |||
| LD a3, 0 * SIZE(X) | |||
| LD a4, 1 * SIZE(X) | |||
| daddu X, X, INCX | |||
| LD a5, 0 * SIZE(X) | |||
| LD a6, 1 * SIZE(X) | |||
| daddu X, X, INCX | |||
| MOV t1, a1 | |||
| MOV t2, a2 | |||
| LD a7, 0 * SIZE(X) | |||
| LD a8, 1 * SIZE(X) | |||
| MOV t3, a3 | |||
| MOV t4, a4 | |||
| daddiu I, I, -1 | |||
| blez I, .L24 /* only one batch: go straight to the drain */ | |||
| daddu X, X, INCX | |||
| .align 3 | |||
| .L23: /* main loop: add the staged batch while loading the next 4 elements */ | |||
| ADD s1, s1, t1 | |||
| LD a1, 0 * SIZE(X) | |||
| MOV t1, a5 | |||
| daddiu I, I, -1 | |||
| ADD s2, s2, t2 | |||
| LD a2, 1 * SIZE(X) | |||
| MOV t2, a6 | |||
| daddu X, X, INCX | |||
| ADD s1, s1, t3 | |||
| LD a3, 0 * SIZE(X) | |||
| MOV t3, a7 | |||
| NOP | |||
| ADD s2, s2, t4 | |||
| LD a4, 1 * SIZE(X) | |||
| MOV t4, a8 | |||
| daddu X, X, INCX | |||
| ADD s1, s1, t1 | |||
| LD a5, 0 * SIZE(X) | |||
| MOV t1, a1 | |||
| NOP | |||
| ADD s2, s2, t2 | |||
| LD a6, 1 * SIZE(X) | |||
| MOV t2, a2 | |||
| daddu X, X, INCX | |||
| ADD s1, s1, t3 | |||
| LD a7, 0 * SIZE(X) | |||
| MOV t3, a3 | |||
| LD a8, 1 * SIZE(X) | |||
| ADD s2, s2, t4 | |||
| daddu X, X, INCX | |||
| bgtz I, .L23 | |||
| MOV t4, a4 | |||
| .align 3 | |||
| .L24: /* drain: add the last fully loaded batch */ | |||
| ADD s1, s1, t1 | |||
| MOV t1, a5 | |||
| ADD s2, s2, t2 | |||
| MOV t2, a6 | |||
| ADD s1, s1, t3 | |||
| MOV t3, a7 | |||
| ADD s2, s2, t4 | |||
| MOV t4, a8 | |||
| ADD s1, s1, t1 | |||
| ADD s2, s2, t2 | |||
| ADD s1, s1, t3 | |||
| ADD s2, s2, t4 | |||
| .align 3 | |||
| .L25: /* tail */ | |||
| andi I, N, 3 /* I = leftover elements (N mod 4) */ | |||
| blez I, .L999 | |||
| NOP | |||
| .align 3 | |||
| .L26: /* one leftover element (two scalars) per iteration */ | |||
| LD a1, 0 * SIZE(X) | |||
| LD a2, 1 * SIZE(X) | |||
| MOV t1, a1 | |||
| daddiu I, I, -1 | |||
| MOV t2, a2 | |||
| daddu X, X, INCX | |||
| ADD s1, s1, t1 | |||
| bgtz I, .L26 | |||
| ADD s2, s2, t2 | |||
| .align 3 | |||
| .L999: /* common exit */ | |||
| j $31 | |||
| ADD s1, s1, s2 /* result = s1 + s2 */ | |||
| EPILOGUE | |||
| @@ -0,0 +1,446 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define N r3 /* number of elements to sum */ | |||
| #define X r4 /* pointer to the current position in the input vector */ | |||
| #define INCX r5 /* stride between elements; rescaled by the slwi below */ | |||
| #define PREA r8 /* offset handed to L1_PREFETCH */ | |||
| #define FZERO f0 /* loaded with 0.0 below; f0 is also the first accumulator */ | |||
| #define STACKSIZE 160 /* frame: FPR save slots at bytes 0-143 plus a scratch word at 144 */ | |||
| PROLOGUE /* kernel entry: plain sum (no absolute value) of N values of X taken INCX apart */ | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* reserve the local frame */ | |||
| li r0, 0 | |||
| stfd f14, 0(SP) /* save f14-f31; every one of them is overwritten by the loops */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| stw r0, 144(SP) /* store an all-zero word in the scratch slot ... */ | |||
| lfs FZERO,144(SP) /* ... and reload it as a float, so FZERO holds 0.0 */ | |||
| #ifdef F_INTERFACE | |||
| LDINT N, 0(N) /* here N and INCX hold addresses; presumably LDINT loads the integer each points to - confirm macro */ | |||
| LDINT INCX, 0(INCX) | |||
| #endif | |||
| slwi INCX, INCX, BASE_SHIFT /* shift INCX left by BASE_SHIFT; presumably element stride to byte stride, since it is compared with SIZE below */ | |||
| fmr f1, FZERO /* zero the remaining accumulators f1-f7 */ | |||
| fmr f2, FZERO | |||
| fmr f3, FZERO | |||
| fmr f4, FZERO | |||
| fmr f5, FZERO | |||
| fmr f6, FZERO | |||
| fmr f7, FZERO | |||
| li PREA, L1_PREFETCHSIZE | |||
| cmpwi cr0, N, 0 | |||
| ble- LL(999) /* N not positive: go straight to the reduction, all accumulators are still zero */ | |||
| cmpwi cr0, INCX, 0 | |||
| ble- LL(999) /* INCX not positive: likewise yields a zero result */ | |||
| cmpwi cr0, INCX, SIZE | |||
| bne- cr0, LL(100) /* stride differs from SIZE: take the strided path; otherwise fall into the unit-stride path */ | |||
| srawi. r0, N, 4 /* unit-stride path: r0 = N / 16 groups of 16 elements; the record form also sets cr0 */ | |||
| mtspr CTR, r0 /* CTR counts the groups */ | |||
| beq- cr0, LL(50) /* no full group: only the remainder loop runs */ | |||
| .align 4 | |||
| LFD f8, 0 * SIZE(X) /* preload the first 16 elements into f8-f15 and f24-f31 */ | |||
| LFD f9, 1 * SIZE(X) | |||
| LFD f10, 2 * SIZE(X) | |||
| LFD f11, 3 * SIZE(X) | |||
| LFD f12, 4 * SIZE(X) | |||
| LFD f13, 5 * SIZE(X) | |||
| LFD f14, 6 * SIZE(X) | |||
| LFD f15, 7 * SIZE(X) | |||
| LFD f24, 8 * SIZE(X) | |||
| LFD f25, 9 * SIZE(X) | |||
| LFD f26, 10 * SIZE(X) | |||
| LFD f27, 11 * SIZE(X) | |||
| LFD f28, 12 * SIZE(X) | |||
| LFD f29, 13 * SIZE(X) | |||
| LFD f30, 14 * SIZE(X) | |||
| LFD f31, 15 * SIZE(X) | |||
| fmr f16, f8 /* stage elements 0-7 in f16-f23, the registers the FADDs consume */ | |||
| fmr f17, f9 | |||
| fmr f18, f10 | |||
| fmr f19, f11 | |||
| fmr f20, f12 | |||
| fmr f21, f13 | |||
| fmr f22, f14 | |||
| fmr f23, f15 | |||
| bdz LL(20) /* decrement CTR; with a single group go directly to the drain code */ | |||
| .align 4 | |||
| LL(10): /* steady state: add the 16 elements already loaded while loading the next 16 */ | |||
| FADD f0, f0, f16 /* accumulate a staged value, then restage from the second half of the group */ | |||
| fmr f16, f24 | |||
| FADD f1, f1, f17 | |||
| fmr f17, f25 | |||
| FADD f2, f2, f18 | |||
| fmr f18, f26 | |||
| FADD f3, f3, f19 | |||
| fmr f19, f27 | |||
| LFD f8, 16 * SIZE(X) /* X still points at the group being added, so the next group sits at offsets 16-31 */ | |||
| LFD f9, 17 * SIZE(X) | |||
| LFD f10, 18 * SIZE(X) | |||
| LFD f11, 19 * SIZE(X) | |||
| FADD f4, f4, f20 | |||
| fmr f20, f28 | |||
| FADD f5, f5, f21 | |||
| fmr f21, f29 | |||
| FADD f6, f6, f22 | |||
| fmr f22, f30 | |||
| FADD f7, f7, f23 | |||
| fmr f23, f31 | |||
| LFD f12, 20 * SIZE(X) | |||
| LFD f13, 21 * SIZE(X) | |||
| LFD f14, 22 * SIZE(X) | |||
| LFD f15, 23 * SIZE(X) | |||
| FADD f0, f0, f16 /* second half of the current group; restage from the freshly loaded f8-f15 */ | |||
| fmr f16, f8 | |||
| FADD f1, f1, f17 | |||
| fmr f17, f9 | |||
| FADD f2, f2, f18 | |||
| fmr f18, f10 | |||
| FADD f3, f3, f19 | |||
| fmr f19, f11 | |||
| LFD f24, 24 * SIZE(X) | |||
| LFD f25, 25 * SIZE(X) | |||
| LFD f26, 26 * SIZE(X) | |||
| LFD f27, 27 * SIZE(X) | |||
| FADD f4, f4, f20 | |||
| fmr f20, f12 | |||
| FADD f5, f5, f21 | |||
| fmr f21, f13 | |||
| FADD f6, f6, f22 | |||
| fmr f22, f14 | |||
| FADD f7, f7, f23 | |||
| fmr f23, f15 | |||
| LFD f28, 28 * SIZE(X) | |||
| LFD f29, 29 * SIZE(X) | |||
| LFD f30, 30 * SIZE(X) | |||
| LFD f31, 31 * SIZE(X) | |||
| #ifndef POWER6 | |||
| L1_PREFETCH X, PREA /* non-POWER6 builds prefetch before X is advanced */ | |||
| #endif | |||
| addi X, X, 16 * SIZE /* advance X by one group */ | |||
| #ifdef POWER6 | |||
| L1_PREFETCH X, PREA /* POWER6 builds prefetch after X is advanced */ | |||
| #endif | |||
| bdnz LL(10) | |||
| .align 4 | |||
| LL(20): /* drain: add the last 16 loaded elements, no further loads */ | |||
| FADD f0, f0, f16 | |||
| fmr f16, f24 | |||
| FADD f1, f1, f17 | |||
| fmr f17, f25 | |||
| FADD f2, f2, f18 | |||
| fmr f18, f26 | |||
| FADD f3, f3, f19 | |||
| fmr f19, f27 | |||
| FADD f4, f4, f20 | |||
| fmr f20, f28 | |||
| FADD f5, f5, f21 | |||
| fmr f21, f29 | |||
| FADD f6, f6, f22 | |||
| fmr f22, f30 | |||
| FADD f7, f7, f23 | |||
| fmr f23, f31 | |||
| FADD f0, f0, f16 | |||
| FADD f1, f1, f17 | |||
| FADD f2, f2, f18 | |||
| FADD f3, f3, f19 | |||
| FADD f4, f4, f20 | |||
| FADD f5, f5, f21 | |||
| FADD f6, f6, f22 | |||
| FADD f7, f7, f23 | |||
| addi X, X, 16 * SIZE /* step past the group just consumed so the remainder loop starts after it */ | |||
| .align 4 | |||
| LL(50): /* remainder: the N mod 16 leftover elements */ | |||
| andi. r0, N, 15 | |||
| mtspr CTR, r0 | |||
| beq LL(999) /* nothing left over */ | |||
| .align 4 | |||
| LL(60): /* one element per iteration, accumulated into f0 */ | |||
| LFD f8, 0 * SIZE(X) | |||
| addi X, X, 1 * SIZE | |||
| FADD f0, f0, f8 | |||
| bdnz LL(60) | |||
| b LL(999) /* skip over the strided path */ | |||
| .align 4 | |||
| LL(100): /* strided path: same pipeline as the unit-stride code, but each load steps X by INCX */ | |||
| sub X, X, INCX /* pre-bias X by one stride; assumes LFDUX is the update-form indexed load (X += INCX, then load) - confirm macro */ | |||
| srawi. r0, N, 4 /* r0 = N / 16 groups; sets cr0 */ | |||
| mtspr CTR, r0 | |||
| beq- LL(150) /* no full group: only the remainder loop runs */ | |||
| LFDUX f8, X, INCX /* preload the first 16 elements into f8-f15 and f24-f31 */ | |||
| LFDUX f9, X, INCX | |||
| LFDUX f10, X, INCX | |||
| LFDUX f11, X, INCX | |||
| LFDUX f12, X, INCX | |||
| LFDUX f13, X, INCX | |||
| LFDUX f14, X, INCX | |||
| LFDUX f15, X, INCX | |||
| LFDUX f24, X, INCX | |||
| LFDUX f25, X, INCX | |||
| LFDUX f26, X, INCX | |||
| LFDUX f27, X, INCX | |||
| LFDUX f28, X, INCX | |||
| LFDUX f29, X, INCX | |||
| LFDUX f30, X, INCX | |||
| LFDUX f31, X, INCX | |||
| fmr f16, f8 /* stage elements 0-7 in f16-f23 for the FADDs */ | |||
| fmr f17, f9 | |||
| fmr f18, f10 | |||
| fmr f19, f11 | |||
| fmr f20, f12 | |||
| fmr f21, f13 | |||
| fmr f22, f14 | |||
| fmr f23, f15 | |||
| bdz LL(120) /* decrement CTR; a single group goes directly to the drain code */ | |||
| .align 4 | |||
| LL(110): /* steady state: add the 16 elements already loaded while loading the next 16 */ | |||
| FADD f0, f0, f16 | |||
| fmr f16, f24 | |||
| FADD f1, f1, f17 | |||
| fmr f17, f25 | |||
| FADD f2, f2, f18 | |||
| fmr f18, f26 | |||
| FADD f3, f3, f19 | |||
| fmr f19, f27 | |||
| LFDUX f8, X, INCX /* next group, elements 0-3 */ | |||
| LFDUX f9, X, INCX | |||
| LFDUX f10, X, INCX | |||
| LFDUX f11, X, INCX | |||
| FADD f4, f4, f20 | |||
| fmr f20, f28 | |||
| FADD f5, f5, f21 | |||
| fmr f21, f29 | |||
| FADD f6, f6, f22 | |||
| fmr f22, f30 | |||
| FADD f7, f7, f23 | |||
| fmr f23, f31 | |||
| LFDUX f12, X, INCX /* next group, elements 4-7 */ | |||
| LFDUX f13, X, INCX | |||
| LFDUX f14, X, INCX | |||
| LFDUX f15, X, INCX | |||
| FADD f0, f0, f16 /* second half of the current group; restage from the freshly loaded f8-f15 */ | |||
| fmr f16, f8 | |||
| FADD f1, f1, f17 | |||
| fmr f17, f9 | |||
| FADD f2, f2, f18 | |||
| fmr f18, f10 | |||
| FADD f3, f3, f19 | |||
| fmr f19, f11 | |||
| LFDUX f24, X, INCX /* next group, elements 8-11 */ | |||
| LFDUX f25, X, INCX | |||
| LFDUX f26, X, INCX | |||
| LFDUX f27, X, INCX | |||
| FADD f4, f4, f20 | |||
| fmr f20, f12 | |||
| FADD f5, f5, f21 | |||
| fmr f21, f13 | |||
| FADD f6, f6, f22 | |||
| fmr f22, f14 | |||
| FADD f7, f7, f23 | |||
| fmr f23, f15 | |||
| LFDUX f28, X, INCX /* next group, elements 12-15 */ | |||
| LFDUX f29, X, INCX | |||
| LFDUX f30, X, INCX | |||
| LFDUX f31, X, INCX | |||
| bdnz LL(110) | |||
| .align 4 | |||
| LL(120): /* drain: add the last 16 loaded elements, no further loads */ | |||
| FADD f0, f0, f16 | |||
| fmr f16, f24 | |||
| FADD f1, f1, f17 | |||
| fmr f17, f25 | |||
| FADD f2, f2, f18 | |||
| fmr f18, f26 | |||
| FADD f3, f3, f19 | |||
| fmr f19, f27 | |||
| FADD f4, f4, f20 | |||
| fmr f20, f28 | |||
| FADD f5, f5, f21 | |||
| fmr f21, f29 | |||
| FADD f6, f6, f22 | |||
| fmr f22, f30 | |||
| FADD f7, f7, f23 | |||
| fmr f23, f31 | |||
| FADD f0, f0, f16 | |||
| FADD f1, f1, f17 | |||
| FADD f2, f2, f18 | |||
| FADD f3, f3, f19 | |||
| FADD f4, f4, f20 | |||
| FADD f5, f5, f21 | |||
| FADD f6, f6, f22 | |||
| FADD f7, f7, f23 | |||
| .align 4 | |||
| LL(150): /* remainder: the N mod 16 leftover elements */ | |||
| andi. r0, N, 15 | |||
| mtspr CTR, r0 | |||
| beq LL(999) /* nothing left over */ | |||
| .align 4 | |||
| LL(160): /* one element per iteration, accumulated into f0 */ | |||
| LFDUX f8, X, INCX | |||
| FADD f0, f0, f8 | |||
| bdnz LL(160) /* falls through to the reduction when done */ | |||
| .align 4 | |||
| LL(999): /* common exit: fold the eight partial sums f0-f7 into one value */ | |||
| FADD f0, f0, f1 /* first level: four pairwise sums */ | |||
| FADD f2, f2, f3 | |||
| FADD f4, f4, f5 | |||
| FADD f6, f6, f7 | |||
| FADD f0, f0, f2 /* second level */ | |||
| FADD f4, f4, f6 | |||
| FADD f1, f0, f4 /* total is left in f1; presumably the FP return register - confirm against the ABI */ | |||
| lfd f14, 0(SP) /* restore f14-f31 from the slots written at entry */ | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| addi SP, SP, STACKSIZE /* release the frame reserved at entry */ | |||
| blr | |||
| EPILOGUE | |||
| @@ -0,0 +1,452 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define N r3 /* number of two-value elements to sum */ | |||
| #define X r4 /* pointer to the current position in the input vector */ | |||
| #define INCX r5 /* stride between elements; rescaled by the slwi below */ | |||
| #define INCXM1 r9 /* INCX minus SIZE, computed below; index used to reach the first value of the next element */ | |||
| #define PREA r8 /* offset handed to L1_PREFETCH */ | |||
| #define FZERO f0 /* loaded with 0.0 below; f0 is also the first accumulator */ | |||
| #define STACKSIZE 160 /* frame: FPR save slots at bytes 0-143 plus a scratch word at 144 */ | |||
| PROLOGUE /* kernel entry: plain sum of both values (presumably real and imaginary parts) of N elements of X */ | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* reserve the local frame */ | |||
| li r0, 0 | |||
| stfd f14, 0(SP) /* save f14-f31; every one of them is overwritten by the loops */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| stw r0, 144(SP) /* store an all-zero word in the scratch slot ... */ | |||
| lfs FZERO,144(SP) /* ... and reload it as a float, so FZERO holds 0.0 */ | |||
| #ifdef F_INTERFACE | |||
| LDINT N, 0(N) /* here N and INCX hold addresses; presumably LDINT loads the integer each points to - confirm macro */ | |||
| LDINT INCX, 0(INCX) | |||
| #endif | |||
| slwi INCX, INCX, ZBASE_SHIFT /* shift INCX left by ZBASE_SHIFT; presumably element stride to byte stride, since it is compared with 2 * SIZE below */ | |||
| subi INCXM1, INCX, SIZE /* INCXM1 = INCX - SIZE */ | |||
| fmr f1, FZERO /* zero the remaining accumulators f1-f7 */ | |||
| fmr f2, FZERO | |||
| fmr f3, FZERO | |||
| fmr f4, FZERO | |||
| fmr f5, FZERO | |||
| fmr f6, FZERO | |||
| fmr f7, FZERO | |||
| li PREA, L1_PREFETCHSIZE | |||
| cmpwi cr0, N, 0 | |||
| ble- LL(999) /* N not positive: go straight to the reduction, all accumulators are still zero */ | |||
| cmpwi cr0, INCX, 0 | |||
| ble- LL(999) /* INCX not positive: likewise yields a zero result */ | |||
| cmpwi cr0, INCX, 2 * SIZE | |||
| bne- cr0, LL(100) /* stride differs from 2 * SIZE: take the strided path; otherwise fall into the contiguous path */ | |||
| srawi. r0, N, 3 /* contiguous path: r0 = N / 8 groups of 8 elements (16 values); the record form also sets cr0 */ | |||
| mtspr CTR, r0 /* CTR counts the groups */ | |||
| beq- cr0, LL(50) /* no full group: only the remainder loop runs */ | |||
| .align 4 | |||
| LFD f8, 0 * SIZE(X) /* preload the first 16 values into f8-f15 and f24-f31 */ | |||
| LFD f9, 1 * SIZE(X) | |||
| LFD f10, 2 * SIZE(X) | |||
| LFD f11, 3 * SIZE(X) | |||
| LFD f12, 4 * SIZE(X) | |||
| LFD f13, 5 * SIZE(X) | |||
| LFD f14, 6 * SIZE(X) | |||
| LFD f15, 7 * SIZE(X) | |||
| LFD f24, 8 * SIZE(X) | |||
| LFD f25, 9 * SIZE(X) | |||
| LFD f26, 10 * SIZE(X) | |||
| LFD f27, 11 * SIZE(X) | |||
| LFD f28, 12 * SIZE(X) | |||
| LFD f29, 13 * SIZE(X) | |||
| LFD f30, 14 * SIZE(X) | |||
| LFD f31, 15 * SIZE(X) | |||
| fmr f16, f8 /* stage values 0-7 in f16-f23, the registers the FADDs consume */ | |||
| fmr f17, f9 | |||
| fmr f18, f10 | |||
| fmr f19, f11 | |||
| fmr f20, f12 | |||
| fmr f21, f13 | |||
| fmr f22, f14 | |||
| fmr f23, f15 | |||
| bdz LL(20) /* decrement CTR; with a single group go directly to the drain code */ | |||
| .align 4 | |||
| LL(10): /* steady state: add the 16 values already loaded while loading the next 16 */ | |||
| FADD f0, f0, f16 /* accumulate a staged value, then restage from the second half of the group */ | |||
| fmr f16, f24 | |||
| FADD f1, f1, f17 | |||
| fmr f17, f25 | |||
| FADD f2, f2, f18 | |||
| fmr f18, f26 | |||
| FADD f3, f3, f19 | |||
| fmr f19, f27 | |||
| LFD f8, 16 * SIZE(X) /* X still points at the group being added, so the next group sits at offsets 16-31 */ | |||
| LFD f9, 17 * SIZE(X) | |||
| LFD f10, 18 * SIZE(X) | |||
| LFD f11, 19 * SIZE(X) | |||
| FADD f4, f4, f20 | |||
| fmr f20, f28 | |||
| FADD f5, f5, f21 | |||
| fmr f21, f29 | |||
| FADD f6, f6, f22 | |||
| fmr f22, f30 | |||
| FADD f7, f7, f23 | |||
| fmr f23, f31 | |||
| LFD f12, 20 * SIZE(X) | |||
| LFD f13, 21 * SIZE(X) | |||
| LFD f14, 22 * SIZE(X) | |||
| LFD f15, 23 * SIZE(X) | |||
| FADD f0, f0, f16 /* second half of the current group; restage from the freshly loaded f8-f15 */ | |||
| fmr f16, f8 | |||
| FADD f1, f1, f17 | |||
| fmr f17, f9 | |||
| FADD f2, f2, f18 | |||
| fmr f18, f10 | |||
| FADD f3, f3, f19 | |||
| fmr f19, f11 | |||
| LFD f24, 24 * SIZE(X) | |||
| LFD f25, 25 * SIZE(X) | |||
| LFD f26, 26 * SIZE(X) | |||
| LFD f27, 27 * SIZE(X) | |||
| FADD f4, f4, f20 | |||
| fmr f20, f12 | |||
| FADD f5, f5, f21 | |||
| fmr f21, f13 | |||
| FADD f6, f6, f22 | |||
| fmr f22, f14 | |||
| FADD f7, f7, f23 | |||
| fmr f23, f15 | |||
| LFD f28, 28 * SIZE(X) | |||
| LFD f29, 29 * SIZE(X) | |||
| LFD f30, 30 * SIZE(X) | |||
| LFD f31, 31 * SIZE(X) | |||
| #ifndef POWER6 | |||
| L1_PREFETCH X, PREA /* non-POWER6 builds prefetch before X is advanced */ | |||
| #endif | |||
| addi X, X, 16 * SIZE /* advance X by one group of 16 values */ | |||
| #ifdef POWER6 | |||
| L1_PREFETCH X, PREA /* POWER6 builds prefetch after X is advanced */ | |||
| #endif | |||
| bdnz LL(10) | |||
| .align 4 | |||
| LL(20): /* drain: add the last 16 loaded values, no further loads */ | |||
| FADD f0, f0, f16 | |||
| fmr f16, f24 | |||
| FADD f1, f1, f17 | |||
| fmr f17, f25 | |||
| FADD f2, f2, f18 | |||
| fmr f18, f26 | |||
| FADD f3, f3, f19 | |||
| fmr f19, f27 | |||
| FADD f4, f4, f20 | |||
| fmr f20, f28 | |||
| FADD f5, f5, f21 | |||
| fmr f21, f29 | |||
| FADD f6, f6, f22 | |||
| fmr f22, f30 | |||
| FADD f7, f7, f23 | |||
| fmr f23, f31 | |||
| FADD f0, f0, f16 | |||
| FADD f1, f1, f17 | |||
| FADD f2, f2, f18 | |||
| FADD f3, f3, f19 | |||
| FADD f4, f4, f20 | |||
| FADD f5, f5, f21 | |||
| FADD f6, f6, f22 | |||
| FADD f7, f7, f23 | |||
| addi X, X, 16 * SIZE /* step past the group just consumed so the remainder loop starts after it */ | |||
| .align 4 | |||
| LL(50): /* remainder: the N mod 8 leftover elements */ | |||
| andi. r0, N, 7 | |||
| mtspr CTR, r0 | |||
| beq LL(999) /* nothing left over */ | |||
| .align 4 | |||
| LL(60): /* one element (two values) per iteration */ | |||
| LFD f8, 0 * SIZE(X) | |||
| LFD f9, 1 * SIZE(X) | |||
| addi X, X, 2 * SIZE | |||
| FADD f0, f0, f8 /* first value goes to f0, second to f1; both are merged in the final reduction */ | |||
| FADD f1, f1, f9 | |||
| bdnz LL(60) | |||
| b LL(999) /* skip over the strided path */ | |||
| .align 4 | |||
| LL(100): /* strided path: same pipeline as the contiguous code, two loads per element */ | |||
| sub X, X, INCXM1 /* pre-bias X so that X + INCXM1 addresses the first value; assumes LFDX is the indexed load and LFDUX its update form - confirm macros */ | |||
| srawi. r0, N, 3 /* r0 = N / 8 groups; sets cr0 */ | |||
| mtspr CTR, r0 | |||
| beq- LL(150) /* no full group: only the remainder loop runs */ | |||
| LFDX f8, X, INCXM1 /* first value of the element, X left unchanged */ | |||
| LFDUX f9, X, INCX /* second value; X moves forward by INCX */ | |||
| LFDX f10, X, INCXM1 | |||
| LFDUX f11, X, INCX | |||
| LFDX f12, X, INCXM1 | |||
| LFDUX f13, X, INCX | |||
| LFDX f14, X, INCXM1 | |||
| LFDUX f15, X, INCX | |||
| LFDX f24, X, INCXM1 | |||
| LFDUX f25, X, INCX | |||
| LFDX f26, X, INCXM1 | |||
| LFDUX f27, X, INCX | |||
| LFDX f28, X, INCXM1 | |||
| LFDUX f29, X, INCX | |||
| LFDX f30, X, INCXM1 | |||
| LFDUX f31, X, INCX | |||
| fmr f16, f8 /* stage values 0-7 in f16-f23 for the FADDs */ | |||
| fmr f17, f9 | |||
| fmr f18, f10 | |||
| fmr f19, f11 | |||
| fmr f20, f12 | |||
| fmr f21, f13 | |||
| fmr f22, f14 | |||
| fmr f23, f15 | |||
| bdz LL(120) /* decrement CTR; a single group goes directly to the drain code */ | |||
| .align 4 | |||
| LL(110): /* steady state: add the 16 values already loaded while loading the next 16 */ | |||
| FADD f0, f0, f16 | |||
| fmr f16, f24 | |||
| FADD f1, f1, f17 | |||
| fmr f17, f25 | |||
| FADD f2, f2, f18 | |||
| fmr f18, f26 | |||
| FADD f3, f3, f19 | |||
| fmr f19, f27 | |||
| LFDX f8, X, INCXM1 /* next group, elements 0-1 */ | |||
| LFDUX f9, X, INCX | |||
| LFDX f10, X, INCXM1 | |||
| LFDUX f11, X, INCX | |||
| FADD f4, f4, f20 | |||
| fmr f20, f28 | |||
| FADD f5, f5, f21 | |||
| fmr f21, f29 | |||
| FADD f6, f6, f22 | |||
| fmr f22, f30 | |||
| FADD f7, f7, f23 | |||
| fmr f23, f31 | |||
| LFDX f12, X, INCXM1 /* next group, elements 2-3 */ | |||
| LFDUX f13, X, INCX | |||
| LFDX f14, X, INCXM1 | |||
| LFDUX f15, X, INCX | |||
| FADD f0, f0, f16 /* second half of the current group; restage from the freshly loaded f8-f15 */ | |||
| fmr f16, f8 | |||
| FADD f1, f1, f17 | |||
| fmr f17, f9 | |||
| FADD f2, f2, f18 | |||
| fmr f18, f10 | |||
| FADD f3, f3, f19 | |||
| fmr f19, f11 | |||
| LFDX f24, X, INCXM1 /* next group, elements 4-5 */ | |||
| LFDUX f25, X, INCX | |||
| LFDX f26, X, INCXM1 | |||
| LFDUX f27, X, INCX | |||
| FADD f4, f4, f20 | |||
| fmr f20, f12 | |||
| FADD f5, f5, f21 | |||
| fmr f21, f13 | |||
| FADD f6, f6, f22 | |||
| fmr f22, f14 | |||
| FADD f7, f7, f23 | |||
| fmr f23, f15 | |||
| LFDX f28, X, INCXM1 /* next group, elements 6-7 */ | |||
| LFDUX f29, X, INCX | |||
| LFDX f30, X, INCXM1 | |||
| LFDUX f31, X, INCX | |||
| bdnz LL(110) | |||
| .align 4 | |||
| LL(120): /* drain: add the last 16 loaded values, no further loads */ | |||
| FADD f0, f0, f16 | |||
| fmr f16, f24 | |||
| FADD f1, f1, f17 | |||
| fmr f17, f25 | |||
| FADD f2, f2, f18 | |||
| fmr f18, f26 | |||
| FADD f3, f3, f19 | |||
| fmr f19, f27 | |||
| FADD f4, f4, f20 | |||
| fmr f20, f28 | |||
| FADD f5, f5, f21 | |||
| fmr f21, f29 | |||
| FADD f6, f6, f22 | |||
| fmr f22, f30 | |||
| FADD f7, f7, f23 | |||
| fmr f23, f31 | |||
| FADD f0, f0, f16 | |||
| FADD f1, f1, f17 | |||
| FADD f2, f2, f18 | |||
| FADD f3, f3, f19 | |||
| FADD f4, f4, f20 | |||
| FADD f5, f5, f21 | |||
| FADD f6, f6, f22 | |||
| FADD f7, f7, f23 | |||
| .align 4 | |||
| LL(150): /* remainder: the N mod 8 leftover elements */ | |||
| andi. r0, N, 7 | |||
| mtspr CTR, r0 | |||
| beq LL(999) /* nothing left over */ | |||
| .align 4 | |||
| LL(160): /* one element (two values) per iteration */ | |||
| LFDX f8, X, INCXM1 | |||
| LFDUX f9, X, INCX | |||
| FADD f0, f0, f8 /* first value goes to f0, second to f1; both are merged in the final reduction */ | |||
| FADD f1, f1, f9 | |||
| bdnz LL(160) /* falls through to the reduction when done */ | |||
| .align 4 | |||
| LL(999): /* common exit: fold the eight partial sums f0-f7 into one value (both values of every element end up in the same total) */ | |||
| FADD f0, f0, f1 /* first level: four pairwise sums */ | |||
| FADD f2, f2, f3 | |||
| FADD f4, f4, f5 | |||
| FADD f6, f6, f7 | |||
| FADD f0, f0, f2 /* second level */ | |||
| FADD f4, f4, f6 | |||
| FADD f1, f0, f4 /* total is left in f1; presumably the FP return register - confirm against the ABI */ | |||
| lfd f14, 0(SP) /* restore f14-f31 from the slots written at entry */ | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| addi SP, SP, STACKSIZE /* release the frame reserved at entry */ | |||
| blr | |||
| EPILOGUE | |||
| @@ -70,7 +70,7 @@ gotoblas_t TABLE_NAME = { | |||
| samax_kTS, samin_kTS, smax_kTS, smin_kTS, | |||
| isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, | |||
| snrm2_kTS, sasum_kTS, scopy_kTS, sdot_kTS, | |||
| snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS, | |||
| dsdot_kTS, | |||
| srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, | |||
| sgemv_nTS, sgemv_tTS, sger_kTS, | |||
| @@ -126,7 +126,7 @@ gotoblas_t TABLE_NAME = { | |||
| damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, | |||
| idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, | |||
| dnrm2_kTS, dasum_kTS, dcopy_kTS, ddot_kTS, | |||
| dnrm2_kTS, dasum_kTS, dsum_kTS, dcopy_kTS, ddot_kTS, | |||
| drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, | |||
| dgemv_nTS, dgemv_tTS, dger_kTS, | |||
| dsymv_LTS, dsymv_UTS, | |||
| @@ -178,7 +178,7 @@ gotoblas_t TABLE_NAME = { | |||
| qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS, | |||
| iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS, | |||
| qnrm2_kTS, qasum_kTS, qcopy_kTS, qdot_kTS, | |||
| qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS, | |||
| qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, | |||
| qgemv_nTS, qgemv_tTS, qger_kTS, | |||
| qsymv_LTS, qsymv_UTS, | |||
| @@ -234,7 +234,7 @@ gotoblas_t TABLE_NAME = { | |||
| #endif | |||
| camax_kTS, camin_kTS, icamax_kTS, icamin_kTS, | |||
| cnrm2_kTS, casum_kTS, ccopy_kTS, | |||
| cnrm2_kTS, casum_kTS, csum_kTS, ccopy_kTS, | |||
| cdotu_kTS, cdotc_kTS, csrot_kTS, | |||
| caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS, | |||
| @@ -369,7 +369,7 @@ gotoblas_t TABLE_NAME = { | |||
| #endif | |||
| zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS, | |||
| znrm2_kTS, zasum_kTS, zcopy_kTS, | |||
| znrm2_kTS, zasum_kTS, zsum_kTS, zcopy_kTS, | |||
| zdotu_kTS, zdotc_kTS, zdrot_kTS, | |||
| zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS, | |||
| @@ -500,7 +500,7 @@ gotoblas_t TABLE_NAME = { | |||
| XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N), | |||
| xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS, | |||
| xnrm2_kTS, xasum_kTS, xcopy_kTS, | |||
| xnrm2_kTS, xasum_kTS, xsum_kTS, xcopy_kTS, | |||
| xdotu_kTS, xdotc_kTS, xqrot_kTS, | |||
| xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS, | |||
| @@ -0,0 +1,325 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define N %i0 /* number of elements to sum */ | |||
| #define X %i1 /* pointer to the current position in the input vector */ | |||
| #define INCX %i2 /* stride between elements; rescaled by the sll below */ | |||
| #define I %i3 /* loop counter */ | |||
| #ifdef DOUBLE | |||
| #define c1 %f0 /* c1, c2: the two running sums */ | |||
| #define c2 %f2 | |||
| #define t1 %f8 /* t1-t4: values staged for the next FADD */ | |||
| #define t2 %f10 | |||
| #define t3 %f12 | |||
| #define t4 %f14 | |||
| #define a1 %f16 /* a1-a8: destinations of the loads */ | |||
| #define a2 %f18 | |||
| #define a3 %f20 | |||
| #define a4 %f22 | |||
| #define a5 %f24 | |||
| #define a6 %f26 | |||
| #define a7 %f28 | |||
| #define a8 %f30 | |||
| #else | |||
| #define c1 %f0 /* same roles as above, using consecutive registers */ | |||
| #define c2 %f1 | |||
| #define t1 %f4 | |||
| #define t2 %f5 | |||
| #define t3 %f6 | |||
| #define t4 %f7 | |||
| #define a1 %f8 | |||
| #define a2 %f9 | |||
| #define a3 %f10 | |||
| #define a4 %f11 | |||
| #define a5 %f12 | |||
| #define a6 %f13 | |||
| #define a7 %f14 | |||
| #define a8 %f15 | |||
| #endif | |||
| PROLOGUE /* kernel entry: plain sum (no absolute value) of N values of X taken INCX apart */ | |||
| SAVESP | |||
| FCLR(0) /* presumably zeroes %f0, i.e. c1 - confirm macro */ | |||
| sll INCX, BASE_SHIFT, INCX /* shift INCX left by BASE_SHIFT; presumably element stride to byte stride, since it is compared with SIZE below */ | |||
| FMOV c1, c2 /* copy c1 into c2 and t1-t4 so the first FADDs in the loops add the same starting value */ | |||
| FMOV c1, t1 | |||
| FMOV c1, t2 | |||
| FMOV c1, t3 | |||
| FMOV c1, t4 | |||
| cmp INCX, 0 | |||
| ble .LL19 /* INCX not positive: go to the final reduction without loading anything */ | |||
| cmp INCX, SIZE /* sits in the delay slot of the ble above, so it executes on both paths */ | |||
| bne .LL50 /* stride differs from SIZE: take the strided path */ | |||
| sra N, 3, I /* delay slot of the bne above; I = N / 8 groups of 8 elements (.LL50 recomputes it) */ | |||
| cmp I, 0 | |||
| ble,pn %icc, .LL15 /* no full group: only the remainder loop runs */ | |||
| nop | |||
| LDF [X + 0 * SIZE], a1 /* unit-stride path: preload the first group of 8 elements into a1-a8 */ | |||
| add I, -1, I /* one group is now in flight */ | |||
| LDF [X + 1 * SIZE], a2 | |||
| cmp I, 0 | |||
| LDF [X + 2 * SIZE], a3 | |||
| LDF [X + 3 * SIZE], a4 | |||
| LDF [X + 4 * SIZE], a5 | |||
| LDF [X + 5 * SIZE], a6 | |||
| LDF [X + 6 * SIZE], a7 | |||
| LDF [X + 7 * SIZE], a8 | |||
| ble,pt %icc, .LL12 /* only one group: skip the steady-state loop */ | |||
| add X, 8 * SIZE, X /* delay slot: X moves past the preloaded group on both paths */ | |||
| #define PREFETCHSIZE 128 /* prefetch distance, multiplied by SIZE in the prefetch address */ | |||
| .LL11: /* steady state: add the staged value, restage from the loaded one, reload the next group */ | |||
| FADD c1, t1, c1 | |||
| prefetch [X + PREFETCHSIZE * SIZE], 0 | |||
| FMOV a1, t1 | |||
| LDF [X + 0 * SIZE], a1 | |||
| FADD c2, t2, c2 | |||
| add I, -1, I | |||
| FMOV a2, t2 | |||
| LDF [X + 1 * SIZE], a2 | |||
| FADD c1, t3, c1 | |||
| cmp I, 0 /* sets the condition tested by the bg at the bottom of the loop */ | |||
| FMOV a3, t3 | |||
| LDF [X + 2 * SIZE], a3 | |||
| FADD c2, t4, c2 | |||
| nop | |||
| FMOV a4, t4 | |||
| LDF [X + 3 * SIZE], a4 | |||
| FADD c1, t1, c1 | |||
| nop | |||
| FMOV a5, t1 | |||
| LDF [X + 4 * SIZE], a5 | |||
| FADD c2, t2, c2 | |||
| nop | |||
| FMOV a6, t2 | |||
| LDF [X + 5 * SIZE], a6 | |||
| FADD c1, t3, c1 | |||
| FMOV a7, t3 | |||
| LDF [X + 6 * SIZE], a7 | |||
| add X, 8 * SIZE, X | |||
| FADD c2, t4, c2 | |||
| FMOV a8, t4 | |||
| bg,pt %icc, .LL11 | |||
| LDF [X - 1 * SIZE], a8 /* delay slot: eighth element of the group; X was already advanced, hence the -1 offset */ | |||
| .LL12: /* drain: same add-and-restage sequence without loads; a5-a8 remain staged in t1-t4 afterwards */ | |||
| FADD c1, t1, c1 | |||
| FMOV a1, t1 | |||
| FADD c2, t2, c2 | |||
| FMOV a2, t2 | |||
| FADD c1, t3, c1 | |||
| FMOV a3, t3 | |||
| FADD c2, t4, c2 | |||
| FMOV a4, t4 | |||
| FADD c1, t1, c1 | |||
| FMOV a5, t1 | |||
| FADD c2, t2, c2 | |||
| FMOV a6, t2 | |||
| FADD c1, t3, c1 | |||
| FMOV a7, t3 | |||
| FADD c2, t4, c2 | |||
| FMOV a8, t4 | |||
| .LL15: /* remainder: the N mod 8 leftover elements */ | |||
| and N, 7, I | |||
| cmp I, 0 | |||
| ble,a,pn %icc, .LL19 /* nothing left over */ | |||
| nop | |||
| .LL16: /* one element per iteration: add the value staged in t1, then stage the new one */ | |||
| LDF [X + 0 * SIZE], a1 | |||
| add I, -1, I | |||
| cmp I, 0 | |||
| FADD c1, t1, c1 | |||
| FMOV a1, t1 | |||
| bg,pt %icc, .LL16 | |||
| add X, 1 * SIZE, X /* delay slot: advance to the next element */ | |||
| .LL19: /* exit: add the values still staged in t1-t4, then merge c2 into c1 */ | |||
| FADD c1, t1, c1 | |||
| FADD c2, t2, c2 | |||
| FADD c1, t3, c1 | |||
| FADD c2, t4, c2 | |||
| FADD c1, c2, c1 /* total is left in c1 (%f0) */ | |||
| return %i7 + 8 | |||
| clr %g0 /* delay slot of the return; %g0 always reads as zero on SPARC, so this has no effect */ | |||
| .LL50: /* strided path: same pipeline as the unit-stride code, X advanced by INCX after every load */ | |||
| sra N, 3, I /* I = N / 8 groups of 8 elements */ | |||
| cmp I, 0 | |||
| ble,pn %icc, .LL55 /* no full group: only the remainder loop runs */ | |||
| nop | |||
| LDF [X + 0 * SIZE], a1 /* preload the first group of 8 elements into a1-a8 */ | |||
| add X, INCX, X | |||
| LDF [X + 0 * SIZE], a2 | |||
| add X, INCX, X | |||
| LDF [X + 0 * SIZE], a3 | |||
| add X, INCX, X | |||
| LDF [X + 0 * SIZE], a4 | |||
| add X, INCX, X | |||
| LDF [X + 0 * SIZE], a5 | |||
| add X, INCX, X | |||
| LDF [X + 0 * SIZE], a6 | |||
| add X, INCX, X | |||
| add I, -1, I /* one group is now in flight */ | |||
| LDF [X + 0 * SIZE], a7 | |||
| cmp I, 0 | |||
| add X, INCX, X | |||
| LDF [X + 0 * SIZE], a8 | |||
| ble,pt %icc, .LL52 /* only one group: skip the steady-state loop */ | |||
| add X, INCX, X /* delay slot: X moves past the eighth element on both paths */ | |||
| .LL51: /* steady state: add the staged value, restage from the loaded one, reload the next group */ | |||
| FADD c1, t1, c1 | |||
| add I, -1, I | |||
| FMOV a1, t1 | |||
| LDF [X + 0 * SIZE], a1 | |||
| add X, INCX, X | |||
| FADD c2, t2, c2 | |||
| cmp I, 0 /* sets the condition tested by the bg at the bottom of the loop */ | |||
| FMOV a2, t2 | |||
| LDF [X + 0 * SIZE], a2 | |||
| add X, INCX, X | |||
| FADD c1, t3, c1 | |||
| FMOV a3, t3 | |||
| LDF [X + 0 * SIZE], a3 | |||
| add X, INCX, X | |||
| FADD c2, t4, c2 | |||
| FMOV a4, t4 | |||
| LDF [X + 0 * SIZE], a4 | |||
| add X, INCX, X | |||
| FADD c1, t1, c1 | |||
| FMOV a5, t1 | |||
| LDF [X + 0 * SIZE], a5 | |||
| add X, INCX, X | |||
| FADD c2, t2, c2 | |||
| FMOV a6, t2 | |||
| LDF [X + 0 * SIZE], a6 | |||
| add X, INCX, X | |||
| FADD c1, t3, c1 | |||
| FMOV a7, t3 | |||
| LDF [X + 0 * SIZE], a7 | |||
| add X, INCX, X | |||
| FADD c2, t4, c2 | |||
| FMOV a8, t4 | |||
| LDF [X + 0 * SIZE], a8 | |||
| bg,pt %icc, .LL51 | |||
| add X, INCX, X /* delay slot: step past the eighth element of the group just loaded */ | |||
| .LL52: /* drain: same add-and-restage sequence without loads; a5-a8 remain staged in t1-t4 afterwards */ | |||
| FADD c1, t1, c1 | |||
| FMOV a1, t1 | |||
| FADD c2, t2, c2 | |||
| FMOV a2, t2 | |||
| FADD c1, t3, c1 | |||
| FMOV a3, t3 | |||
| FADD c2, t4, c2 | |||
| FMOV a4, t4 | |||
| FADD c1, t1, c1 | |||
| FMOV a5, t1 | |||
| FADD c2, t2, c2 | |||
| FMOV a6, t2 | |||
| FADD c1, t3, c1 | |||
| FMOV a7, t3 | |||
| FADD c2, t4, c2 | |||
| FMOV a8, t4 | |||
| .LL55: /* remainder: the N mod 8 leftover elements */ | |||
| and N, 7, I | |||
| cmp I, 0 | |||
| ble,a,pn %icc, .LL59 /* nothing left over */ | |||
| nop | |||
| .LL56: /* one element per iteration: add the value staged in t1, then stage the new one */ | |||
| LDF [X + 0 * SIZE], a1 | |||
| FADD c1, t1, c1 | |||
| add I, -1, I | |||
| FMOV a1, t1 | |||
| cmp I, 0 | |||
| bg,pt %icc, .LL56 | |||
| add X, INCX, X /* delay slot: advance to the next element */ | |||
| .LL59: /* exit: add the values still staged in t1-t4, then merge c2 into c1 */ | |||
| FADD c1, t1, c1 | |||
| FADD c2, t2, c2 | |||
| FADD c1, t3, c1 | |||
| FADD c2, t4, c2 | |||
| FADD c1, c2, c1 /* total is left in c1 (%f0) */ | |||
| return %i7 + 8 | |||
| clr %o0 /* delay slot of the return; NOTE(review): the .LL19 exit uses clr %g0 in this slot - confirm the difference is intended */ | |||
| EPILOGUE | |||
| @@ -0,0 +1,327 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define N %i0 | |||
| #define X %i1 | |||
| #define INCX %i2 | |||
| #define I %i3 | |||
| #ifdef DOUBLE | |||
| #define c1 %f0 | |||
| #define c2 %f2 | |||
| #define t1 %f8 | |||
| #define t2 %f10 | |||
| #define t3 %f12 | |||
| #define t4 %f14 | |||
| #define a1 %f16 | |||
| #define a2 %f18 | |||
| #define a3 %f20 | |||
| #define a4 %f22 | |||
| #define a5 %f24 | |||
| #define a6 %f26 | |||
| #define a7 %f28 | |||
| #define a8 %f30 | |||
| #else | |||
| #define c1 %f0 | |||
| #define c2 %f1 | |||
| #define t1 %f4 | |||
| #define t2 %f5 | |||
| #define t3 %f6 | |||
| #define t4 %f7 | |||
| #define a1 %f8 | |||
| #define a2 %f9 | |||
| #define a3 %f10 | |||
| #define a4 %f11 | |||
| #define a5 %f12 | |||
| #define a6 %f13 | |||
| #define a7 %f14 | |||
| #define a8 %f15 | |||
| #endif | |||
| PROLOGUE /* adds up the real and imaginary parts of N complex elements of X (stride INCX); the total ends up in c1 */ | |||
| SAVESP | |||
| FCLR(0) /* NOTE(review): presumably zeroes %f0 (c1); the macro is defined outside this file - confirm */ | |||
| sll INCX, ZBASE_SHIFT, INCX /* scale the stride; compared against 2 * SIZE below, so presumably a byte stride - confirm */ | |||
| FMOV c1, c2 /* copy the initial c1 value into c2 and t1-t4 */ | |||
| FMOV c1, t1 | |||
| FMOV c1, t2 | |||
| FMOV c1, t3 | |||
| FMOV c1, t4 | |||
| cmp INCX, 0 | |||
| ble .LL19 /* INCX <= 0: go straight to the final reduction */ | |||
| nop /* branch delay slot */ | |||
| cmp INCX, 2 * SIZE | |||
| bne .LL50 /* stride is not one complex element: use the strided path */ | |||
| nop /* branch delay slot */ | |||
| sra N, 2, I /* I = N >> 2: the main loop handles 4 complex elements per pass */ | |||
| cmp I, 0 | |||
| ble,pn %icc, .LL15 /* no full group of 4: go to the remainder loop */ | |||
| nop | |||
| LDF [X + 0 * SIZE], a1 /* preload the first 8 scalars (4 complex elements) */ | |||
| add I, -1, I | |||
| LDF [X + 1 * SIZE], a2 | |||
| cmp I, 0 | |||
| LDF [X + 2 * SIZE], a3 | |||
| LDF [X + 3 * SIZE], a4 | |||
| LDF [X + 4 * SIZE], a5 | |||
| LDF [X + 5 * SIZE], a6 | |||
| LDF [X + 6 * SIZE], a7 | |||
| LDF [X + 7 * SIZE], a8 | |||
| ble,pt %icc, .LL12 /* only one group: skip the loop and drain what was preloaded */ | |||
| add X, 8 * SIZE, X /* delay slot: advance X past the preloaded group */ | |||
| #define PREFETCHSIZE 32 | |||
| .LL11: /* pipelined loop: add the staged t values, restage a -> t, load the next group into a */ | |||
| FADD c1, t1, c1 | |||
| prefetch [X + PREFETCHSIZE * SIZE], 0 | |||
| FMOV a1, t1 | |||
| LDF [X + 0 * SIZE], a1 | |||
| FADD c2, t2, c2 | |||
| add I, -1, I | |||
| FMOV a2, t2 | |||
| LDF [X + 1 * SIZE], a2 | |||
| FADD c1, t3, c1 | |||
| cmp I, 0 /* sets the condition codes tested by the bg at the bottom of the loop */ | |||
| FMOV a3, t3 | |||
| LDF [X + 2 * SIZE], a3 | |||
| FADD c2, t4, c2 | |||
| nop | |||
| FMOV a4, t4 | |||
| LDF [X + 3 * SIZE], a4 | |||
| FADD c1, t1, c1 | |||
| nop | |||
| FMOV a5, t1 | |||
| LDF [X + 4 * SIZE], a5 | |||
| FADD c2, t2, c2 | |||
| nop | |||
| FMOV a6, t2 | |||
| LDF [X + 5 * SIZE], a6 | |||
| FADD c1, t3, c1 | |||
| FMOV a7, t3 | |||
| LDF [X + 6 * SIZE], a7 | |||
| add X, 8 * SIZE, X | |||
| FADD c2, t4, c2 | |||
| FMOV a8, t4 | |||
| bg,pt %icc, .LL11 | |||
| LDF [X - 1 * SIZE], a8 /* delay slot: X is already advanced, so this is the last scalar of the new group */ | |||
| .LL12: /* drain: push the last loaded group through the t registers */ | |||
| FADD c1, t1, c1 | |||
| FMOV a1, t1 | |||
| FADD c2, t2, c2 | |||
| FMOV a2, t2 | |||
| FADD c1, t3, c1 | |||
| FMOV a3, t3 | |||
| FADD c2, t4, c2 | |||
| FMOV a4, t4 | |||
| FADD c1, t1, c1 | |||
| FMOV a5, t1 | |||
| FADD c2, t2, c2 | |||
| FMOV a6, t2 | |||
| FADD c1, t3, c1 | |||
| FMOV a7, t3 | |||
| FADD c2, t4, c2 | |||
| FMOV a8, t4 | |||
| .LL15: /* remainder: N & 3 elements, one complex element per pass */ | |||
| and N, 3, I | |||
| cmp I, 0 | |||
| ble,a,pn %icc, .LL19 /* nothing left: go to the final reduction */ | |||
| nop | |||
| .LL16: | |||
| LDF [X + 0 * SIZE], a1 | |||
| LDF [X + 1 * SIZE], a2 | |||
| add I, -1, I | |||
| cmp I, 0 | |||
| FADD c1, t1, c1 | |||
| FADD c2, t2, c2 | |||
| FMOV a1, t1 | |||
| FMOV a2, t2 | |||
| bg,pt %icc, .LL16 | |||
| add X, 2 * SIZE, X /* delay slot: step to the next complex element */ | |||
| .LL19: /* final reduction: add the still-staged t1-t4, then c1 = c1 + c2 */ | |||
| FADD c1, t1, c1 | |||
| FADD c2, t2, c2 | |||
| FADD c1, t3, c1 | |||
| FADD c2, t4, c2 | |||
| FADD c1, c2, c1 | |||
| return %i7 + 8 | |||
| clr %g0 /* delay slot of the return */ | |||
| .LL50: /* strided path: same pipeline, X advances by INCX after each complex element */ | |||
| sra N, 2, I | |||
| cmp I, 0 | |||
| ble,pn %icc, .LL55 /* no full group of 4: go to the strided remainder loop */ | |||
| nop | |||
| LDF [X + 0 * SIZE], a1 /* preload 4 complex elements */ | |||
| LDF [X + 1 * SIZE], a2 | |||
| add X, INCX, X | |||
| LDF [X + 0 * SIZE], a3 | |||
| LDF [X + 1 * SIZE], a4 | |||
| add X, INCX, X | |||
| LDF [X + 0 * SIZE], a5 | |||
| LDF [X + 1 * SIZE], a6 | |||
| add X, INCX, X | |||
| add I, -1, I | |||
| LDF [X + 0 * SIZE], a7 | |||
| cmp I, 0 | |||
| LDF [X + 1 * SIZE], a8 | |||
| ble,pt %icc, .LL52 /* only one group: skip the loop and drain */ | |||
| add X, INCX, X /* delay slot: step past the fourth preloaded element */ | |||
| .LL51: /* pipelined strided loop, 4 complex elements per pass */ | |||
| FADD c1, t1, c1 | |||
| add I, -1, I | |||
| FMOV a1, t1 | |||
| LDF [X + 0 * SIZE], a1 | |||
| FADD c2, t2, c2 | |||
| cmp I, 0 /* sets the condition codes tested by the bg at the bottom of the loop */ | |||
| FMOV a2, t2 | |||
| LDF [X + 1 * SIZE], a2 | |||
| add X, INCX, X | |||
| FADD c1, t3, c1 | |||
| FMOV a3, t3 | |||
| LDF [X + 0 * SIZE], a3 | |||
| FADD c2, t4, c2 | |||
| FMOV a4, t4 | |||
| LDF [X + 1 * SIZE], a4 | |||
| add X, INCX, X | |||
| FADD c1, t1, c1 | |||
| FMOV a5, t1 | |||
| LDF [X + 0 * SIZE], a5 | |||
| FADD c2, t2, c2 | |||
| FMOV a6, t2 | |||
| LDF [X + 1 * SIZE], a6 | |||
| add X, INCX, X | |||
| FADD c1, t3, c1 | |||
| FMOV a7, t3 | |||
| LDF [X + 0 * SIZE], a7 | |||
| FADD c2, t4, c2 | |||
| FMOV a8, t4 | |||
| LDF [X + 1 * SIZE], a8 | |||
| bg,pt %icc, .LL51 | |||
| add X, INCX, X /* delay slot: step past the last element just loaded */ | |||
| .LL52: /* drain: push the last loaded group through the t registers */ | |||
| FADD c1, t1, c1 | |||
| FMOV a1, t1 | |||
| FADD c2, t2, c2 | |||
| FMOV a2, t2 | |||
| FADD c1, t3, c1 | |||
| FMOV a3, t3 | |||
| FADD c2, t4, c2 | |||
| FMOV a4, t4 | |||
| FADD c1, t1, c1 | |||
| FMOV a5, t1 | |||
| FADD c2, t2, c2 | |||
| FMOV a6, t2 | |||
| FADD c1, t3, c1 | |||
| FMOV a7, t3 | |||
| FADD c2, t4, c2 | |||
| FMOV a8, t4 | |||
| .LL55: /* strided remainder: N & 3 elements, one complex element per pass */ | |||
| and N, 3, I | |||
| cmp I, 0 | |||
| ble,a,pn %icc, .LL59 /* nothing left: go to the final reduction */ | |||
| nop | |||
| .LL56: | |||
| LDF [X + 0 * SIZE], a1 | |||
| LDF [X + 1 * SIZE], a2 | |||
| FADD c1, t1, c1 | |||
| FADD c2, t2, c2 | |||
| add I, -1, I | |||
| FMOV a1, t1 | |||
| FMOV a2, t2 | |||
| cmp I, 0 | |||
| bg,pt %icc, .LL56 | |||
| add X, INCX, X /* delay slot: step to the next complex element */ | |||
| .LL59: /* final reduction: add the still-staged t1-t4, then c1 = c1 + c2 */ | |||
| FADD c1, t1, c1 | |||
| FADD c2, t2, c2 | |||
| FADD c1, t3, c1 | |||
| FADD c2, t4, c2 | |||
| FADD c1, c2, c1 | |||
| return %i7 + 8 | |||
| clr %o0 /* delay slot of the return */ | |||
| EPILOGUE | |||
| @@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c | |||
| CASUMKERNEL = ../arm/zasum.c | |||
| ZASUMKERNEL = ../arm/zasum.c | |||
| # ?SUM kernel sources: sum.c for S/D and zsum.c for C/Z, both taken from ../arm | |||
| SSUMKERNEL = ../arm/sum.c | |||
| DSUMKERNEL = ../arm/sum.c | |||
| CSUMKERNEL = ../arm/zsum.c | |||
| ZSUMKERNEL = ../arm/zsum.c | |||
| SAXPYKERNEL = ../arm/axpy.c | |||
| DAXPYKERNEL = ../arm/axpy.c | |||
| CAXPYKERNEL = ../arm/zaxpy.c | |||
| @@ -0,0 +1,207 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define STACK 8 /* bytes pushed by the routine (two pushl) before the arguments are read */ | |||
| #define ARGS 0 | |||
| #define STACK_M 4 + STACK + ARGS(%esp) /* stack slot of the element count */ | |||
| #define STACK_X 8 + STACK + ARGS(%esp) /* stack slot of the vector pointer */ | |||
| #define STACK_INCX 12 + STACK + ARGS(%esp) /* stack slot of the stride */ | |||
| #define M %edx /* element count */ | |||
| #define X %ecx /* pointer to the current element */ | |||
| #define INCX %esi /* stride; shifted by BASE_SHIFT in the routine */ | |||
| #define I %eax /* loop counter */ | |||
| #include "l1param.h" | |||
| PROLOGUE /* adds up M elements of X (stride INCX) on the x87 stack; one value, the total, is left on the stack at return */ | |||
| pushl %esi /* saved because INCX lives in %esi */ | |||
| pushl %ebx | |||
| PROFCODE | |||
| #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) | |||
| EMMS | |||
| #endif | |||
| movl STACK_M, M | |||
| movl STACK_X, X | |||
| movl STACK_INCX, INCX | |||
| #ifdef F_INTERFACE | |||
| movl (M), M /* with F_INTERFACE the count and stride are passed by reference */ | |||
| movl (INCX), INCX | |||
| #endif | |||
| fldz /* push 0.0: this is what the early exits below leave as the result */ | |||
| testl M, M | |||
| jle .L999 /* M <= 0 */ | |||
| testl INCX, INCX | |||
| jle .L999 /* INCX <= 0 */ | |||
| sall $BASE_SHIFT, INCX /* scale the stride; compared against SIZE below, so presumably a byte stride - confirm */ | |||
| fldz /* three more zeros: four partial sums now sit in %st(0)-%st(3) */ | |||
| fldz | |||
| fldz | |||
| cmpl $SIZE, INCX | |||
| jne .L40 /* stride is not one element: use the strided loops */ | |||
| movl M, I | |||
| sarl $3, I /* I = M >> 3: 8 elements per pass */ | |||
| jle .L20 | |||
| ALIGN_4 | |||
| .L10: /* unit-stride loop: two batches of 4 loads, each folded into the four partial sums */ | |||
| #ifdef PREFETCH | |||
| PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) | |||
| #endif | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| FLD 2 * SIZE(X) | |||
| FLD 3 * SIZE(X) | |||
| faddp %st, %st(7) /* each faddp adds the top value into one partial sum and pops it */ | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| FLD 4 * SIZE(X) | |||
| FLD 5 * SIZE(X) | |||
| FLD 6 * SIZE(X) | |||
| FLD 7 * SIZE(X) | |||
| addl $8 * SIZE, X | |||
| faddp %st, %st(7) | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| decl I | |||
| jg .L10 | |||
| ALIGN_4 | |||
| .L20: /* unit-stride remainder: M & 7 elements, one per pass */ | |||
| movl M, I | |||
| andl $7, I | |||
| jle .L998 | |||
| ALIGN_4 | |||
| .L21: | |||
| FLD (X) | |||
| faddp %st,%st(1) /* add the loaded value into the top partial sum */ | |||
| addl $1 * SIZE, X | |||
| decl I | |||
| jg .L21 | |||
| jmp .L998 | |||
| ALIGN_4 | |||
| .L40: /* strided path */ | |||
| movl M, I | |||
| sarl $3, I /* I = M >> 3: 8 elements per pass */ | |||
| jle .L60 | |||
| ALIGN_4 | |||
| .L50: /* strided loop: X advances by INCX after every load */ | |||
| FLD (X) | |||
| addl INCX, X | |||
| FLD (X) | |||
| addl INCX, X | |||
| FLD (X) | |||
| addl INCX, X | |||
| FLD (X) | |||
| addl INCX, X | |||
| faddp %st, %st(7) | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| FLD (X) | |||
| addl INCX, X | |||
| FLD (X) | |||
| addl INCX, X | |||
| FLD (X) | |||
| addl INCX, X | |||
| FLD (X) | |||
| addl INCX, X | |||
| faddp %st, %st(7) | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| decl I | |||
| jg .L50 | |||
| ALIGN_4 | |||
| .L60: /* strided remainder: M & 7 elements, one per pass */ | |||
| movl M, I | |||
| andl $7, I | |||
| jle .L998 | |||
| ALIGN_4 | |||
| .L61: | |||
| FLD (X) | |||
| addl INCX, X | |||
| faddp %st,%st(1) | |||
| decl I | |||
| jg .L61 | |||
| ALIGN_4 | |||
| .L998: /* fold the four partial sums into a single value */ | |||
| faddp %st,%st(2) | |||
| faddp %st,%st(1) | |||
| faddp %st,%st(1) | |||
| ALIGN_4 | |||
| .L999: /* restore the saved registers and return */ | |||
| popl %ebx | |||
| popl %esi | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,208 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define STACK 8 /* bytes pushed by the routine (two pushl) before the arguments are read */ | |||
| #define ARGS 0 | |||
| #define STACK_M 4 + STACK + ARGS(%esp) /* stack slot of the element count */ | |||
| #define STACK_X 8 + STACK + ARGS(%esp) /* stack slot of the vector pointer */ | |||
| #define STACK_INCX 12 + STACK + ARGS(%esp) /* stack slot of the stride */ | |||
| #define M %edx /* number of complex elements */ | |||
| #define X %ecx /* pointer to the current complex element */ | |||
| #define INCX %esi /* stride; shifted by ZBASE_SHIFT in the routine */ | |||
| #define I %eax /* loop counter */ | |||
| #include "l1param.h" | |||
| PROLOGUE /* adds up both scalars of each of M complex elements of X (stride INCX) on the x87 stack; one value, the total, is left on the stack at return */ | |||
| pushl %esi /* saved because INCX lives in %esi */ | |||
| pushl %ebx | |||
| PROFCODE | |||
| #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) | |||
| EMMS | |||
| #endif | |||
| movl STACK_M, M | |||
| movl STACK_X, X | |||
| movl STACK_INCX, INCX | |||
| #ifdef F_INTERFACE | |||
| movl (M), M /* with F_INTERFACE the count and stride are passed by reference */ | |||
| movl (INCX), INCX | |||
| #endif | |||
| fldz /* push 0.0: this is what the early exits below leave as the result */ | |||
| testl M, M | |||
| jle .L999 /* M <= 0 */ | |||
| testl INCX, INCX | |||
| jle .L999 /* INCX <= 0 */ | |||
| sall $ZBASE_SHIFT, INCX /* scale the stride; compared against SIZE * 2 below, so presumably a byte stride - confirm */ | |||
| fldz /* three more zeros: four partial sums now sit in %st(0)-%st(3) */ | |||
| fldz | |||
| fldz | |||
| cmpl $SIZE * 2, INCX | |||
| jne .L40 /* stride is not one complex element: use the strided loops */ | |||
| movl M, I | |||
| sarl $2, I /* I = M >> 2: 4 complex elements (8 scalars) per pass */ | |||
| jle .L20 | |||
| ALIGN_4 | |||
| .L10: /* contiguous loop: two batches of 4 scalar loads, each folded into the four partial sums */ | |||
| #ifdef PREFETCH | |||
| PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) | |||
| #endif | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| FLD 2 * SIZE(X) | |||
| FLD 3 * SIZE(X) | |||
| faddp %st, %st(7) /* each faddp adds the top value into one partial sum and pops it */ | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| FLD 4 * SIZE(X) | |||
| FLD 5 * SIZE(X) | |||
| FLD 6 * SIZE(X) | |||
| FLD 7 * SIZE(X) | |||
| addl $8 * SIZE, X | |||
| faddp %st, %st(7) | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| decl I | |||
| jg .L10 | |||
| ALIGN_4 | |||
| .L20: /* contiguous remainder: M & 3 complex elements, one per pass */ | |||
| movl M, I | |||
| andl $3, I | |||
| jle .L998 | |||
| ALIGN_4 | |||
| .L21: | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| faddp %st,%st(3) /* second scalar goes into one partial sum, first scalar into another */ | |||
| faddp %st,%st(1) | |||
| addl $2 * SIZE, X | |||
| decl I | |||
| jg .L21 | |||
| jmp .L998 | |||
| ALIGN_4 | |||
| .L40: /* strided path */ | |||
| movl M, I | |||
| sarl $2, I /* I = M >> 2: 4 complex elements per pass */ | |||
| jle .L60 | |||
| ALIGN_4 | |||
| .L50: /* strided loop: X advances by INCX after each complex element */ | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| addl INCX, X | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| addl INCX, X | |||
| faddp %st, %st(7) | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| addl INCX, X | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| addl INCX, X | |||
| faddp %st, %st(7) | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| decl I | |||
| jg .L50 | |||
| ALIGN_4 | |||
| .L60: /* strided remainder: M & 3 complex elements, one per pass */ | |||
| movl M, I | |||
| andl $3, I | |||
| jle .L998 | |||
| ALIGN_4 | |||
| .L61: | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| addl INCX, X | |||
| faddp %st,%st(3) | |||
| faddp %st,%st(1) | |||
| decl I | |||
| jg .L61 | |||
| ALIGN_4 | |||
| .L998: /* fold the four partial sums into a single value */ | |||
| faddp %st,%st(2) | |||
| faddp %st,%st(1) | |||
| faddp %st,%st(1) | |||
| ALIGN_4 | |||
| .L999: /* restore the saved registers and return */ | |||
| popl %ebx | |||
| popl %esi | |||
| ret | |||
| EPILOGUE | |||
| @@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c | |||
| CASUMKERNEL = ../arm/zasum.c | |||
| ZASUMKERNEL = ../arm/zasum.c | |||
| # ?SUM kernel sources: sum.c for S/D and zsum.c for C/Z, both taken from ../arm | |||
| SSUMKERNEL = ../arm/sum.c | |||
| DSUMKERNEL = ../arm/sum.c | |||
| CSUMKERNEL = ../arm/zsum.c | |||
| ZSUMKERNEL = ../arm/zsum.c | |||
| SAXPYKERNEL = ../arm/axpy.c | |||
| DAXPYKERNEL = ../arm/axpy.c | |||
| CAXPYKERNEL = ../arm/zaxpy.c | |||
| @@ -0,0 +1,179 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M ARG1 /* element count; also reused as the remainder-loop counter */ | |||
| #define X ARG2 /* pointer to the current element */ | |||
| #define INCX ARG3 /* stride; shifted by BASE_SHIFT in the routine */ | |||
| #define I %rax /* main-loop counter */ | |||
| #include "l1param.h" | |||
| PROLOGUE /* adds up M elements of X (stride INCX) on the x87 stack; one value, the total, is left on the stack at return */ | |||
| PROFCODE | |||
| fldz /* push 0.0: this is what the early exits below leave as the result */ | |||
| testq M, M | |||
| jle .L999 /* M <= 0 */ | |||
| testq INCX, INCX | |||
| jle .L999 /* INCX <= 0 */ | |||
| salq $BASE_SHIFT, INCX /* scale the stride; compared against SIZE below, so presumably a byte stride - confirm */ | |||
| fldz /* three more zeros: four partial sums now sit in %st(0)-%st(3) */ | |||
| fldz | |||
| fldz | |||
| cmpq $SIZE, INCX | |||
| jne .L40 /* stride is not one element: use the strided loops */ | |||
| movq M, I | |||
| sarq $3, I /* I = M >> 3: 8 elements per pass */ | |||
| jle .L20 | |||
| ALIGN_4 | |||
| .L10: /* unit-stride loop: two batches of 4 loads, each folded into the four partial sums */ | |||
| #ifdef PREFETCH | |||
| PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) | |||
| #endif | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| FLD 2 * SIZE(X) | |||
| FLD 3 * SIZE(X) | |||
| faddp %st, %st(7) /* each faddp adds the top value into one partial sum and pops it */ | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| FLD 4 * SIZE(X) | |||
| FLD 5 * SIZE(X) | |||
| FLD 6 * SIZE(X) | |||
| FLD 7 * SIZE(X) | |||
| addq $8 * SIZE, X | |||
| faddp %st, %st(7) | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| decq I | |||
| jg .L10 | |||
| ALIGN_4 | |||
| .L20: /* unit-stride remainder: M is reduced in place to M & 7 and counted down */ | |||
| andq $7, M | |||
| jle .L998 | |||
| ALIGN_4 | |||
| .L21: | |||
| FLD (X) | |||
| faddp %st,%st(1) /* add the loaded value into the top partial sum */ | |||
| addq $1 * SIZE, X | |||
| decq M | |||
| jg .L21 | |||
| jmp .L998 | |||
| ALIGN_4 | |||
| .L40: /* strided path */ | |||
| movq M, I | |||
| sarq $3, I /* I = M >> 3: 8 elements per pass */ | |||
| jle .L60 | |||
| ALIGN_4 | |||
| .L50: /* strided loop: X advances by INCX after every load */ | |||
| FLD (X) | |||
| addq INCX, X | |||
| FLD (X) | |||
| addq INCX, X | |||
| FLD (X) | |||
| addq INCX, X | |||
| FLD (X) | |||
| addq INCX, X | |||
| faddp %st, %st(7) | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| FLD (X) | |||
| addq INCX, X | |||
| FLD (X) | |||
| addq INCX, X | |||
| FLD (X) | |||
| addq INCX, X | |||
| FLD (X) | |||
| addq INCX, X | |||
| faddp %st, %st(7) | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| decq I | |||
| jg .L50 | |||
| ALIGN_4 | |||
| .L60: /* strided remainder: M is reduced in place to M & 7 and counted down */ | |||
| andq $7, M | |||
| jle .L998 | |||
| ALIGN_4 | |||
| .L61: | |||
| FLD (X) | |||
| addq INCX, X | |||
| faddp %st,%st(1) | |||
| decq M | |||
| jg .L61 | |||
| ALIGN_4 | |||
| .L998: /* fold the four partial sums into a single value */ | |||
| faddp %st,%st(2) | |||
| faddp %st,%st(1) | |||
| faddp %st,%st(1) | |||
| ALIGN_4 | |||
| .L999: | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,180 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M ARG1 /* number of complex elements; also reused as the remainder-loop counter */ | |||
| #define X ARG2 /* pointer to the current complex element */ | |||
| #define INCX ARG3 /* stride; shifted by ZBASE_SHIFT in the routine */ | |||
| #define I %rax /* main-loop counter */ | |||
| #include "l1param.h" | |||
| PROLOGUE /* adds up both scalars of each of M complex elements of X (stride INCX) on the x87 stack; one value, the total, is left on the stack at return */ | |||
| PROFCODE | |||
| fldz /* push 0.0: this is what the early exits below leave as the result */ | |||
| testq M, M | |||
| jle .L999 /* M <= 0 */ | |||
| testq INCX, INCX | |||
| jle .L999 /* INCX <= 0 */ | |||
| salq $ZBASE_SHIFT, INCX /* scale the stride; compared against SIZE * 2 below, so presumably a byte stride - confirm */ | |||
| fldz /* three more zeros: four partial sums now sit in %st(0)-%st(3) */ | |||
| fldz | |||
| fldz | |||
| cmpq $SIZE * 2, INCX | |||
| jne .L40 /* stride is not one complex element: use the strided loops */ | |||
| movq M, I | |||
| sarq $2, I /* I = M >> 2: 4 complex elements (8 scalars) per pass */ | |||
| jle .L20 | |||
| ALIGN_4 | |||
| .L10: /* contiguous loop: two batches of 4 scalar loads, each folded into the four partial sums */ | |||
| #ifdef PREFETCH | |||
| PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) | |||
| #endif | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| FLD 2 * SIZE(X) | |||
| FLD 3 * SIZE(X) | |||
| faddp %st, %st(7) /* each faddp adds the top value into one partial sum and pops it */ | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| FLD 4 * SIZE(X) | |||
| FLD 5 * SIZE(X) | |||
| FLD 6 * SIZE(X) | |||
| FLD 7 * SIZE(X) | |||
| addq $8 * SIZE, X | |||
| faddp %st, %st(7) | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| decq I | |||
| jg .L10 | |||
| ALIGN_4 | |||
| .L20: /* contiguous remainder: M is reduced in place to M & 3 and counted down */ | |||
| andq $3, M | |||
| jle .L998 | |||
| ALIGN_4 | |||
| .L21: | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| faddp %st,%st(3) /* second scalar goes into one partial sum, first scalar into another */ | |||
| faddp %st,%st(1) | |||
| addq $2 * SIZE, X | |||
| decq M | |||
| jg .L21 | |||
| jmp .L998 | |||
| ALIGN_4 | |||
| .L40: /* strided path */ | |||
| movq M, I | |||
| sarq $2, I /* I = M >> 2: 4 complex elements per pass */ | |||
| jle .L60 | |||
| ALIGN_4 | |||
| .L50: /* strided loop: X advances by INCX after each complex element */ | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| addq INCX, X | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| addq INCX, X | |||
| faddp %st, %st(7) | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| addq INCX, X | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| addq INCX, X | |||
| faddp %st, %st(7) | |||
| faddp %st, %st(5) | |||
| faddp %st, %st(3) | |||
| faddp %st, %st(1) | |||
| decq I | |||
| jg .L50 | |||
| ALIGN_4 | |||
| .L60: /* strided remainder: M is reduced in place to M & 3 and counted down */ | |||
| andq $3, M | |||
| jle .L998 | |||
| ALIGN_4 | |||
| .L61: | |||
| FLD 0 * SIZE(X) | |||
| FLD 1 * SIZE(X) | |||
| addq INCX, X | |||
| faddp %st,%st(3) | |||
| faddp %st,%st(1) | |||
| decq M | |||
| jg .L61 | |||
| ALIGN_4 | |||
| .L998: /* fold the four partial sums into a single value */ | |||
| faddp %st,%st(2) | |||
| faddp %st,%st(1) | |||
| faddp %st,%st(1) | |||
| ALIGN_4 | |||
| .L999: | |||
| ret | |||
| EPILOGUE | |||
| @@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c | |||
| CASUMKERNEL = ../arm/zasum.c | |||
| ZASUMKERNEL = zasum.c | |||
| # ?SUM is the plain (non-absolute) sum, so it must not reuse the ?ASUM sources. | |||
| # Same split as ?ASUM above: S/C use the C sources from ../arm, D/Z use the local ones. | |||
| SSUMKERNEL = ../arm/sum.c | |||
| DSUMKERNEL = dsum.c | |||
| CSUMKERNEL = ../arm/zsum.c | |||
| ZSUMKERNEL = zsum.c | |||
| SAXPYKERNEL = ../arm/axpy.c | |||
| DAXPYKERNEL = daxpy.c | |||
| CAXPYKERNEL = ../arm/zaxpy.c | |||
| @@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c | |||
| CASUMKERNEL = casum.c | |||
| ZASUMKERNEL = zasum.c | |||
| # ?SUM kernel sources: one per precision (ssum.c, dsum.c, csum.c, zsum.c) | |||
| SSUMKERNEL = ssum.c | |||
| DSUMKERNEL = dsum.c | |||
| CSUMKERNEL = csum.c | |||
| ZSUMKERNEL = zsum.c | |||
| SAXPYKERNEL = saxpy.c | |||
| DAXPYKERNEL = daxpy.c | |||
| CAXPYKERNEL = caxpy.c | |||
| @@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c | |||
| CASUMKERNEL = ../arm/zasum.c | |||
| ZASUMKERNEL = ../arm/zasum.c | |||
| # ?SUM kernel sources: sum.c for S/D and zsum.c for C/Z, both taken from ../arm | |||
| SSUMKERNEL = ../arm/sum.c | |||
| DSUMKERNEL = ../arm/sum.c | |||
| CSUMKERNEL = ../arm/zsum.c | |||
| ZSUMKERNEL = ../arm/zsum.c | |||
| SAXPYKERNEL = ../arm/axpy.c | |||
| DAXPYKERNEL = ../arm/axpy.c | |||
| CAXPYKERNEL = ../arm/zaxpy.c | |||
| @@ -0,0 +1,137 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| /* Sums every scalar (both parts of each complex value) of a contiguous vector. */ | |||
| /* n: number of complex elements; it is shifted right by 5 to form the trip */ | |||
| /*    count of a decrement-and-branch loop, so the caller must pass a positive */ | |||
| /*    multiple of 32 (the driver below passes n & -32 only when that is > 0). */ | |||
| /* x: start of the data; each loop pass reads 256 bytes through sixteen 16-byte loads. */ | |||
| /* Returns the total, stored from element 0 of v24 after the horizontal reduction. */ | |||
| static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) { | |||
|   FLOAT sum; | |||
|   /* v24-v31 are eight independent vector accumulators, zeroed up front. */ | |||
|   __asm__("vzero %%v24\n\t" | |||
|           "vzero %%v25\n\t" | |||
|           "vzero %%v26\n\t" | |||
|           "vzero %%v27\n\t" | |||
|           "vzero %%v28\n\t" | |||
|           "vzero %%v29\n\t" | |||
|           "vzero %%v30\n\t" | |||
|           "vzero %%v31\n\t" | |||
|           "srlg %[n],%[n],5\n\t" | |||
|           "xgr %%r1,%%r1\n\t" | |||
|           "0:\n\t" | |||
|           "pfd 1, 1024(%%r1,%[x])\n\t" | |||
|           "vl %%v16, 0(%%r1,%[x])\n\t" | |||
|           "vl %%v17, 16(%%r1,%[x])\n\t" | |||
|           "vl %%v18, 32(%%r1,%[x])\n\t" | |||
|           "vl %%v19, 48(%%r1,%[x])\n\t" | |||
|           "vl %%v20, 64(%%r1,%[x])\n\t" | |||
|           "vl %%v21, 80(%%r1,%[x])\n\t" | |||
|           "vl %%v22, 96(%%r1,%[x])\n\t" | |||
|           "vl %%v23, 112(%%r1,%[x])\n\t" | |||
|           "vfasb %%v24,%%v24,%%v16\n\t" | |||
|           "vfasb %%v25,%%v25,%%v17\n\t" | |||
|           "vfasb %%v26,%%v26,%%v18\n\t" | |||
|           "vfasb %%v27,%%v27,%%v19\n\t" | |||
|           "vfasb %%v28,%%v28,%%v20\n\t" | |||
|           "vfasb %%v29,%%v29,%%v21\n\t" | |||
|           "vfasb %%v30,%%v30,%%v22\n\t" | |||
|           "vfasb %%v31,%%v31,%%v23\n\t" | |||
|           "vl %%v16, 128(%%r1,%[x])\n\t" | |||
|           "vl %%v17, 144(%%r1,%[x])\n\t" | |||
|           "vl %%v18, 160(%%r1,%[x])\n\t" | |||
|           "vl %%v19, 176(%%r1,%[x])\n\t" | |||
|           "vl %%v20, 192(%%r1,%[x])\n\t" | |||
|           "vl %%v21, 208(%%r1,%[x])\n\t" | |||
|           "vl %%v22, 224(%%r1,%[x])\n\t" | |||
|           "vl %%v23, 240(%%r1,%[x])\n\t" | |||
|           "vfasb %%v24,%%v24,%%v16\n\t" | |||
|           "vfasb %%v25,%%v25,%%v17\n\t" | |||
|           "vfasb %%v26,%%v26,%%v18\n\t" | |||
|           "vfasb %%v27,%%v27,%%v19\n\t" | |||
|           "vfasb %%v28,%%v28,%%v20\n\t" | |||
|           "vfasb %%v29,%%v29,%%v21\n\t" | |||
|           "vfasb %%v30,%%v30,%%v22\n\t" | |||
|           "vfasb %%v31,%%v31,%%v23\n\t" | |||
|           "agfi %%r1,256\n\t" | |||
|           "brctg %[n],0b\n\t" | |||
|           "vfasb %%v24,%%v24,%%v25\n\t" | |||
|           "vfasb %%v24,%%v24,%%v26\n\t" | |||
|           "vfasb %%v24,%%v24,%%v27\n\t" | |||
|           "vfasb %%v24,%%v24,%%v28\n\t" | |||
|           "vfasb %%v24,%%v24,%%v29\n\t" | |||
|           "vfasb %%v24,%%v24,%%v30\n\t" | |||
|           "vfasb %%v24,%%v24,%%v31\n\t" | |||
|           "veslg %%v25,%%v24,32\n\t" | |||
|           "vfasb %%v24,%%v24,%%v25\n\t" | |||
|           "vrepf %%v25,%%v24,2\n\t" | |||
|           "vfasb %%v24,%%v24,%%v25\n\t" | |||
|           /* The store must name the declared output operand [sum]; the old %[asum] did not exist. */ | |||
|           "vstef %%v24,%[sum],0" | |||
|           : [sum] "=Q"(sum),[n] "+&r"(n) | |||
|           : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) | |||
|           : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||
|             "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||
|   return sum; | |||
| } | |||
| /* Driver: returns the sum of both scalars of each of the n complex elements of x. */ | |||
| /* inc_x is the stride in complex elements; n <= 0 or inc_x <= 0 yields 0. */ | |||
| /* For unit stride the leading multiple-of-32 part goes through csum_kernel_32. */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
|   FLOAT total = 0.0; | |||
|   BLASLONG done = 0;   /* complex elements consumed so far */ | |||
|   BLASLONG idx = 0;    /* scalar index of the next element's first part */ | |||
|   BLASLONG step; | |||
|   if (n <= 0 || inc_x <= 0) | |||
|     return total; | |||
|   step = 2 * inc_x;    /* two scalars per complex element */ | |||
|   if (inc_x == 1) { | |||
|     BLASLONG blocked = n & -32; | |||
|     if (blocked > 0) { | |||
|       total = csum_kernel_32(blocked, x); | |||
|       done = blocked; | |||
|       idx = 2 * blocked; | |||
|     } | |||
|   } | |||
|   /* Scalar loop: the tail after the kernel, or the whole vector when strided. */ | |||
|   for (; done < n; done++, idx += step) | |||
|     total += x[idx] + x[idx + 1]; | |||
|   return total; | |||
| } | |||
| @@ -0,0 +1,148 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) { | |||
| FLOAT sum; | |||
| __asm__("vzero %%v24\n\t" | |||
| "vzero %%v25\n\t" | |||
| "vzero %%v26\n\t" | |||
| "vzero %%v27\n\t" | |||
| "vzero %%v28\n\t" | |||
| "vzero %%v29\n\t" | |||
| "vzero %%v30\n\t" | |||
| "vzero %%v31\n\t" | |||
| "srlg %[n],%[n],5\n\t" | |||
| "xgr %%r1,%%r1\n\t" | |||
| "0:\n\t" | |||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||
| "vl %%v16, 0(%%r1,%[x])\n\t" | |||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||
| "vl %%v20, 64(%%r1,%[x])\n\t" | |||
| "vl %%v21, 80(%%r1,%[x])\n\t" | |||
| "vl %%v22, 96(%%r1,%[x])\n\t" | |||
| "vl %%v23, 112(%%r1,%[x])\n\t" | |||
| "vfadb %%v24,%%v24,%%v16\n\t" | |||
| "vfadb %%v25,%%v25,%%v17\n\t" | |||
| "vfadb %%v26,%%v26,%%v18\n\t" | |||
| "vfadb %%v27,%%v27,%%v19\n\t" | |||
| "vfadb %%v28,%%v28,%%v20\n\t" | |||
| "vfadb %%v29,%%v29,%%v21\n\t" | |||
| "vfadb %%v30,%%v30,%%v22\n\t" | |||
| "vfadb %%v31,%%v31,%%v23\n\t" | |||
| "vl %%v16, 128(%%r1,%[x])\n\t" | |||
| "vl %%v17, 144(%%r1,%[x])\n\t" | |||
| "vl %%v18, 160(%%r1,%[x])\n\t" | |||
| "vl %%v19, 176(%%r1,%[x])\n\t" | |||
| "vl %%v20, 192(%%r1,%[x])\n\t" | |||
| "vl %%v21, 208(%%r1,%[x])\n\t" | |||
| "vl %%v22, 224(%%r1,%[x])\n\t" | |||
| "vl %%v23, 240(%%r1,%[x])\n\t" | |||
| "vfadb %%v24,%%v24,%%v16\n\t" | |||
| "vfadb %%v25,%%v25,%%v17\n\t" | |||
| "vfadb %%v26,%%v26,%%v18\n\t" | |||
| "vfadb %%v27,%%v27,%%v19\n\t" | |||
| "vfadb %%v28,%%v28,%%v20\n\t" | |||
| "vfadb %%v29,%%v29,%%v21\n\t" | |||
| "vfadb %%v30,%%v30,%%v22\n\t" | |||
| "vfadb %%v31,%%v31,%%v23\n\t" | |||
| "agfi %%r1,256\n\t" | |||
| "brctg %[n],0b\n\t" | |||
| "vfadb %%v24,%%v24,%%v25\n\t" | |||
| "vfadb %%v24,%%v24,%%v26\n\t" | |||
| "vfadb %%v24,%%v24,%%v27\n\t" | |||
| "vfadb %%v24,%%v24,%%v28\n\t" | |||
| "vfadb %%v24,%%v24,%%v29\n\t" | |||
| "vfadb %%v24,%%v24,%%v30\n\t" | |||
| "vfadb %%v24,%%v24,%%v31\n\t" | |||
| "vrepg %%v25,%%v24,1\n\t" | |||
| "vfadb %%v24,%%v24,%%v25\n\t" | |||
| "vsteg %%v24,%[asum],0" | |||
| : [sum] "=Q"(sum),[n] "+&r"(n) | |||
| : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) | |||
| : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||
| "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||
| return sum; | |||
| } | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG i = 0; | |||
| BLASLONG j = 0; | |||
| FLOAT sumf = 0.0; | |||
| BLASLONG n1; | |||
| if (n <= 0 || inc_x <= 0) | |||
| return sumf; | |||
| if (inc_x == 1) { | |||
| n1 = n & -32; | |||
| if (n1 > 0) { | |||
| sumf = dsum_kernel_32(n1, x); | |||
| i = n1; | |||
| } | |||
| while (i < n) { | |||
| sumf += x[i]; | |||
| i++; | |||
| } | |||
| } else { | |||
| BLASLONG n1 = n & -4; | |||
| register FLOAT sum1, sum2; | |||
| sum1 = 0.0; | |||
| sum2 = 0.0; | |||
| while (j < n1) { | |||
| sum1 += x[i]; | |||
| sum2 += x[i + inc_x]; | |||
| sum1 += x[i + 2 * inc_x]; | |||
| sum2 += x[i + 3 * inc_x]; | |||
| i += inc_x * 4; | |||
| j += 4; | |||
| } | |||
| sumf = sum1 + sum2; | |||
| while (j < n) { | |||
| sumf += x[i]; | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| } | |||
| return sumf; | |||
| } | |||
| @@ -0,0 +1,151 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) { | |||
| FLOAT sum; | |||
| __asm__("vzero %%v24\n\t" | |||
| "vzero %%v25\n\t" | |||
| "vzero %%v26\n\t" | |||
| "vzero %%v27\n\t" | |||
| "vzero %%v28\n\t" | |||
| "vzero %%v29\n\t" | |||
| "vzero %%v30\n\t" | |||
| "vzero %%v31\n\t" | |||
| "srlg %[n],%[n],6\n\t" | |||
| "xgr %%r1,%%r1\n\t" | |||
| "0:\n\t" | |||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||
| "vl %%v16, 0(%%r1,%[x])\n\t" | |||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||
| "vl %%v20, 64(%%r1,%[x])\n\t" | |||
| "vl %%v21, 80(%%r1,%[x])\n\t" | |||
| "vl %%v22, 96(%%r1,%[x])\n\t" | |||
| "vl %%v23, 112(%%r1,%[x])\n\t" | |||
| "vfasb %%v24,%%v24,%%v16\n\t" | |||
| "vfasb %%v25,%%v25,%%v17\n\t" | |||
| "vfasb %%v26,%%v26,%%v18\n\t" | |||
| "vfasb %%v27,%%v27,%%v19\n\t" | |||
| "vfasb %%v28,%%v28,%%v20\n\t" | |||
| "vfasb %%v29,%%v29,%%v21\n\t" | |||
| "vfasb %%v30,%%v30,%%v22\n\t" | |||
| "vfasb %%v31,%%v31,%%v23\n\t" | |||
| "vl %%v16, 128(%%r1,%[x])\n\t" | |||
| "vl %%v17, 144(%%r1,%[x])\n\t" | |||
| "vl %%v18, 160(%%r1,%[x])\n\t" | |||
| "vl %%v19, 176(%%r1,%[x])\n\t" | |||
| "vl %%v20, 192(%%r1,%[x])\n\t" | |||
| "vl %%v21, 208(%%r1,%[x])\n\t" | |||
| "vl %%v22, 224(%%r1,%[x])\n\t" | |||
| "vl %%v23, 240(%%r1,%[x])\n\t" | |||
| "vfasb %%v24,%%v24,%%v16\n\t" | |||
| "vfasb %%v25,%%v25,%%v17\n\t" | |||
| "vfasb %%v26,%%v26,%%v18\n\t" | |||
| "vfasb %%v27,%%v27,%%v19\n\t" | |||
| "vfasb %%v28,%%v28,%%v20\n\t" | |||
| "vfasb %%v29,%%v29,%%v21\n\t" | |||
| "vfasb %%v30,%%v30,%%v22\n\t" | |||
| "vfasb %%v31,%%v31,%%v23\n\t" | |||
| "agfi %%r1,256\n\t" | |||
| "brctg %[n],0b\n\t" | |||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||
| "vfasb %%v24,%%v24,%%v26\n\t" | |||
| "vfasb %%v24,%%v24,%%v27\n\t" | |||
| "vfasb %%v24,%%v24,%%v28\n\t" | |||
| "vfasb %%v24,%%v24,%%v29\n\t" | |||
| "vfasb %%v24,%%v24,%%v30\n\t" | |||
| "vfasb %%v24,%%v24,%%v31\n\t" | |||
| "veslg %%v25,%%v24,32\n\t" | |||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||
| "vrepf %%v25,%%v24,2\n\t" | |||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||
| "vstef %%v24,%[asum],0" | |||
| : [sum] "=Q"(sum),[n] "+&r"(n) | |||
| : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) | |||
| : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||
| "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||
| return sum; | |||
| } | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG i = 0; | |||
| BLASLONG j = 0; | |||
| FLOAT sumf = 0.0; | |||
| BLASLONG n1; | |||
| if (n <= 0 || inc_x <= 0) | |||
| return sumf; | |||
| if (inc_x == 1) { | |||
| n1 = n & -64; | |||
| if (n1 > 0) { | |||
| sumf = ssum_kernel_64(n1, x); | |||
| i = n1; | |||
| } | |||
| while (i < n) { | |||
| sumf += x[i]; | |||
| i++; | |||
| } | |||
| } else { | |||
| BLASLONG n1 = n & -4; | |||
| register FLOAT sum1, sum2; | |||
| sum1 = 0.0; | |||
| sum2 = 0.0; | |||
| while (j < n1) { | |||
| sum1 += x[i]; | |||
| sum2 += x[i + inc_x]; | |||
| sum1 += x[i + 2 * inc_x]; | |||
| sum2 += x[i + 3 * inc_x]; | |||
| i += inc_x * 4; | |||
| j += 4; | |||
| } | |||
| sumf = sum1 + sum2; | |||
| while (j < n) { | |||
| sumf += x[i]; | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| } | |||
| return sumf; | |||
| } | |||
| @@ -0,0 +1,136 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) { | |||
| FLOAT sum; | |||
| __asm__("vzero %%v24\n\t" | |||
| "vzero %%v25\n\t" | |||
| "vzero %%v26\n\t" | |||
| "vzero %%v27\n\t" | |||
| "vzero %%v28\n\t" | |||
| "vzero %%v29\n\t" | |||
| "vzero %%v30\n\t" | |||
| "vzero %%v31\n\t" | |||
| "srlg %[n],%[n],4\n\t" | |||
| "xgr %%r1,%%r1\n\t" | |||
| "0:\n\t" | |||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||
| "vl %%v16, 0(%%r1,%[x])\n\t" | |||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||
| "vl %%v20, 64(%%r1,%[x])\n\t" | |||
| "vl %%v21, 80(%%r1,%[x])\n\t" | |||
| "vl %%v22, 96(%%r1,%[x])\n\t" | |||
| "vl %%v23, 112(%%r1,%[x])\n\t" | |||
| "vfadb %%v24,%%v24,%%v16\n\t" | |||
| "vfadb %%v25,%%v25,%%v17\n\t" | |||
| "vfadb %%v26,%%v26,%%v18\n\t" | |||
| "vfadb %%v27,%%v27,%%v19\n\t" | |||
| "vfadb %%v28,%%v28,%%v20\n\t" | |||
| "vfadb %%v29,%%v29,%%v21\n\t" | |||
| "vfadb %%v30,%%v30,%%v22\n\t" | |||
| "vfadb %%v31,%%v31,%%v23\n\t" | |||
| "vl %%v16, 128(%%r1,%[x])\n\t" | |||
| "vl %%v17, 144(%%r1,%[x])\n\t" | |||
| "vl %%v18, 160(%%r1,%[x])\n\t" | |||
| "vl %%v19, 176(%%r1,%[x])\n\t" | |||
| "vl %%v20, 192(%%r1,%[x])\n\t" | |||
| "vl %%v21, 208(%%r1,%[x])\n\t" | |||
| "vl %%v22, 224(%%r1,%[x])\n\t" | |||
| "vl %%v23, 240(%%r1,%[x])\n\t" | |||
| "vfadb %%v24,%%v24,%%v16\n\t" | |||
| "vfadb %%v25,%%v25,%%v17\n\t" | |||
| "vfadb %%v26,%%v26,%%v18\n\t" | |||
| "vfadb %%v27,%%v27,%%v19\n\t" | |||
| "vfadb %%v28,%%v28,%%v20\n\t" | |||
| "vfadb %%v29,%%v29,%%v21\n\t" | |||
| "vfadb %%v30,%%v30,%%v22\n\t" | |||
| "vfadb %%v31,%%v31,%%v23\n\t" | |||
| "agfi %%r1,256\n\t" | |||
| "brctg %[n],0b\n\t" | |||
| "vfadb %%v24,%%v24,%%v25\n\t" | |||
| "vfadb %%v24,%%v24,%%v26\n\t" | |||
| "vfadb %%v24,%%v24,%%v27\n\t" | |||
| "vfadb %%v24,%%v24,%%v28\n\t" | |||
| "vfadb %%v24,%%v24,%%v29\n\t" | |||
| "vfadb %%v24,%%v24,%%v30\n\t" | |||
| "vfadb %%v24,%%v24,%%v31\n\t" | |||
| "vrepg %%v25,%%v24,1\n\t" | |||
| "vfadb %%v24,%%v24,%%v25\n\t" | |||
| "vsteg %%v24,%[asum],0" | |||
| : [sum] "=Q"(sum),[n] "+&r"(n) | |||
| : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) | |||
| : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||
| "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||
| return sum; | |||
| } | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG i = 0; | |||
| BLASLONG ip = 0; | |||
| FLOAT sumf = 0.0; | |||
| BLASLONG n1; | |||
| BLASLONG inc_x2; | |||
| if (n <= 0 || inc_x <= 0) | |||
| return (sumf); | |||
| if (inc_x == 1) { | |||
| n1 = n & -16; | |||
| if (n1 > 0) { | |||
| sumf = zsum_kernel_16(n1, x); | |||
| i = n1; | |||
| ip = 2 * n1; | |||
| } | |||
| while (i < n) { | |||
| sumf += x[ip] + x[ip + 1]; | |||
| i++; | |||
| ip += 2; | |||
| } | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| while (i < n) { | |||
| sumf += x[ip] + x[ip + 1]; | |||
| ip += inc_x2; | |||
| i++; | |||
| } | |||
| } | |||
| return (sumf); | |||
| } | |||