Add (C)BLAS extension ?sum
tags/v0.3.6^2
| @@ -73,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS | |||||
| float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | ||||
| double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | ||||
| float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||||
| double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||||
| float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||||
| double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||||
| float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX); | float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX); | ||||
| double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX); | double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX); | ||||
| float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX); | float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX); | ||||
| @@ -107,6 +107,12 @@ macro(SetDefaultL1) | |||||
| set(DAXPBYKERNEL ../arm/axpby.c) | set(DAXPBYKERNEL ../arm/axpby.c) | ||||
| set(CAXPBYKERNEL ../arm/zaxpby.c) | set(CAXPBYKERNEL ../arm/zaxpby.c) | ||||
| set(ZAXPBYKERNEL ../arm/zaxpby.c) | set(ZAXPBYKERNEL ../arm/zaxpby.c) | ||||
| set(SSUMKERNEL sum.S) | |||||
| set(DSUMKERNEL sum.S) | |||||
| set(CSUMKERNEL zsum.S) | |||||
| set(ZSUMKERNEL zsum.S) | |||||
| set(QSUMKERNEL sum.S) | |||||
| set(XSUMKERNEL zsum.S) | |||||
| endmacro () | endmacro () | ||||
| macro(SetDefaultL2) | macro(SetDefaultL2) | ||||
| @@ -162,4 +168,4 @@ macro(SetDefaultL3) | |||||
| set(DGEADD_KERNEL ../generic/geadd.c) | set(DGEADD_KERNEL ../generic/geadd.c) | ||||
| set(CGEADD_KERNEL ../generic/zgeadd.c) | set(CGEADD_KERNEL ../generic/zgeadd.c) | ||||
| set(ZGEADD_KERNEL ../generic/zgeadd.c) | set(ZGEADD_KERNEL ../generic/zgeadd.c) | ||||
| endmacro () | |||||
| endmacro () | |||||
| @@ -19,6 +19,7 @@ | |||||
| #define CDOTC_K cdotc_k | #define CDOTC_K cdotc_k | ||||
| #define CNRM2_K cnrm2_k | #define CNRM2_K cnrm2_k | ||||
| #define CSCAL_K cscal_k | #define CSCAL_K cscal_k | ||||
| #define CSUM_K csum_k | |||||
| #define CSWAP_K cswap_k | #define CSWAP_K cswap_k | ||||
| #define CROT_K csrot_k | #define CROT_K csrot_k | ||||
| @@ -249,6 +250,7 @@ | |||||
| #define CDOTC_K gotoblas -> cdotc_k | #define CDOTC_K gotoblas -> cdotc_k | ||||
| #define CNRM2_K gotoblas -> cnrm2_k | #define CNRM2_K gotoblas -> cnrm2_k | ||||
| #define CSCAL_K gotoblas -> cscal_k | #define CSCAL_K gotoblas -> cscal_k | ||||
| #define CSUM_K gotoblas -> csum_k | |||||
| #define CSWAP_K gotoblas -> cswap_k | #define CSWAP_K gotoblas -> cswap_k | ||||
| #define CROT_K gotoblas -> csrot_k | #define CROT_K gotoblas -> csrot_k | ||||
| @@ -19,6 +19,7 @@ | |||||
| #define DDOTC_K ddot_k | #define DDOTC_K ddot_k | ||||
| #define DNRM2_K dnrm2_k | #define DNRM2_K dnrm2_k | ||||
| #define DSCAL_K dscal_k | #define DSCAL_K dscal_k | ||||
| #define DSUM_K dsum_k | |||||
| #define DSWAP_K dswap_k | #define DSWAP_K dswap_k | ||||
| #define DROT_K drot_k | #define DROT_K drot_k | ||||
| @@ -174,6 +175,7 @@ | |||||
| #define DDOTC_K gotoblas -> ddot_k | #define DDOTC_K gotoblas -> ddot_k | ||||
| #define DNRM2_K gotoblas -> dnrm2_k | #define DNRM2_K gotoblas -> dnrm2_k | ||||
| #define DSCAL_K gotoblas -> dscal_k | #define DSCAL_K gotoblas -> dscal_k | ||||
| #define DSUM_K gotoblas -> dsum_k | |||||
| #define DSWAP_K gotoblas -> dswap_k | #define DSWAP_K gotoblas -> dswap_k | ||||
| #define DROT_K gotoblas -> drot_k | #define DROT_K gotoblas -> drot_k | ||||
| @@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *); | |||||
| double BLASFUNC(dzasum)(blasint *, double *, blasint *); | double BLASFUNC(dzasum)(blasint *, double *, blasint *); | ||||
| xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *); | xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *); | ||||
| FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *); | |||||
| FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *); | |||||
| double BLASFUNC(dsum) (blasint *, double *, blasint *); | |||||
| xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *); | |||||
| double BLASFUNC(dzsum)(blasint *, double *, blasint *); | |||||
| xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *); | |||||
| blasint BLASFUNC(isamax)(blasint *, float *, blasint *); | blasint BLASFUNC(isamax)(blasint *, float *, blasint *); | ||||
| blasint BLASFUNC(idamax)(blasint *, double *, blasint *); | blasint BLASFUNC(idamax)(blasint *, double *, blasint *); | ||||
| blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *); | blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *); | ||||
| @@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG); | |||||
| double zasum_k (BLASLONG, double *, BLASLONG); | double zasum_k (BLASLONG, double *, BLASLONG); | ||||
| xdouble xasum_k (BLASLONG, xdouble *, BLASLONG); | xdouble xasum_k (BLASLONG, xdouble *, BLASLONG); | ||||
| float ssum_k (BLASLONG, float *, BLASLONG); | |||||
| double dsum_k (BLASLONG, double *, BLASLONG); | |||||
| xdouble qsum_k (BLASLONG, xdouble *, BLASLONG); | |||||
| float csum_k (BLASLONG, float *, BLASLONG); | |||||
| double zsum_k (BLASLONG, double *, BLASLONG); | |||||
| xdouble xsum_k (BLASLONG, xdouble *, BLASLONG); | |||||
| float samax_k (BLASLONG, float *, BLASLONG); | float samax_k (BLASLONG, float *, BLASLONG); | ||||
| double damax_k (BLASLONG, double *, BLASLONG); | double damax_k (BLASLONG, double *, BLASLONG); | ||||
| xdouble qamax_k (BLASLONG, xdouble *, BLASLONG); | xdouble qamax_k (BLASLONG, xdouble *, BLASLONG); | ||||
| @@ -66,6 +66,7 @@ | |||||
| #define DOTC_K QDOTC_K | #define DOTC_K QDOTC_K | ||||
| #define NRM2_K QNRM2_K | #define NRM2_K QNRM2_K | ||||
| #define SCAL_K QSCAL_K | #define SCAL_K QSCAL_K | ||||
| #define SUM_K QSUM_K | |||||
| #define SWAP_K QSWAP_K | #define SWAP_K QSWAP_K | ||||
| #define ROT_K QROT_K | #define ROT_K QROT_K | ||||
| @@ -356,6 +357,7 @@ | |||||
| #define DOTC_K DDOTC_K | #define DOTC_K DDOTC_K | ||||
| #define NRM2_K DNRM2_K | #define NRM2_K DNRM2_K | ||||
| #define SCAL_K DSCAL_K | #define SCAL_K DSCAL_K | ||||
| #define SUM_K DSUM_K | |||||
| #define SWAP_K DSWAP_K | #define SWAP_K DSWAP_K | ||||
| #define ROT_K DROT_K | #define ROT_K DROT_K | ||||
| @@ -658,6 +660,7 @@ | |||||
| #define DOTC_K SDOTC_K | #define DOTC_K SDOTC_K | ||||
| #define NRM2_K SNRM2_K | #define NRM2_K SNRM2_K | ||||
| #define SCAL_K SSCAL_K | #define SCAL_K SSCAL_K | ||||
| #define SUM_K SSUM_K | |||||
| #define SWAP_K SSWAP_K | #define SWAP_K SSWAP_K | ||||
| #define ROT_K SROT_K | #define ROT_K SROT_K | ||||
| @@ -962,6 +965,7 @@ | |||||
| #define DOTC_K XDOTC_K | #define DOTC_K XDOTC_K | ||||
| #define NRM2_K XNRM2_K | #define NRM2_K XNRM2_K | ||||
| #define SCAL_K XSCAL_K | #define SCAL_K XSCAL_K | ||||
| #define SUM_K XSUM_K | |||||
| #define SWAP_K XSWAP_K | #define SWAP_K XSWAP_K | ||||
| #define ROT_K XROT_K | #define ROT_K XROT_K | ||||
| @@ -1363,6 +1367,7 @@ | |||||
| #define DOTC_K ZDOTC_K | #define DOTC_K ZDOTC_K | ||||
| #define NRM2_K ZNRM2_K | #define NRM2_K ZNRM2_K | ||||
| #define SCAL_K ZSCAL_K | #define SCAL_K ZSCAL_K | ||||
| #define SUM_K ZSUM_K | |||||
| #define SWAP_K ZSWAP_K | #define SWAP_K ZSWAP_K | ||||
| #define ROT_K ZROT_K | #define ROT_K ZROT_K | ||||
| @@ -1785,6 +1790,7 @@ | |||||
| #define DOTC_K CDOTC_K | #define DOTC_K CDOTC_K | ||||
| #define NRM2_K CNRM2_K | #define NRM2_K CNRM2_K | ||||
| #define SCAL_K CSCAL_K | #define SCAL_K CSCAL_K | ||||
| #define SUM_K CSUM_K | |||||
| #define SWAP_K CSWAP_K | #define SWAP_K CSWAP_K | ||||
| #define ROT_K CROT_K | #define ROT_K CROT_K | ||||
| @@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||||
| float (*snrm2_k) (BLASLONG, float *, BLASLONG); | float (*snrm2_k) (BLASLONG, float *, BLASLONG); | ||||
| float (*sasum_k) (BLASLONG, float *, BLASLONG); | float (*sasum_k) (BLASLONG, float *, BLASLONG); | ||||
| float (*ssum_k) (BLASLONG, float *, BLASLONG); | |||||
| int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
| float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
| double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
| @@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||||
| double (*dnrm2_k) (BLASLONG, double *, BLASLONG); | double (*dnrm2_k) (BLASLONG, double *, BLASLONG); | ||||
| double (*dasum_k) (BLASLONG, double *, BLASLONG); | double (*dasum_k) (BLASLONG, double *, BLASLONG); | ||||
| double (*dsum_k) (BLASLONG, double *, BLASLONG); | |||||
| int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
| double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
| int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | ||||
| @@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); | |||||
| xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); | xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); | ||||
| xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); | xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); | ||||
| xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG); | |||||
| int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
| xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
| int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | ||||
| @@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); | |||||
| float (*cnrm2_k) (BLASLONG, float *, BLASLONG); | float (*cnrm2_k) (BLASLONG, float *, BLASLONG); | ||||
| float (*casum_k) (BLASLONG, float *, BLASLONG); | float (*casum_k) (BLASLONG, float *, BLASLONG); | ||||
| float (*csum_k) (BLASLONG, float *, BLASLONG); | |||||
| int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
| openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
| openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | ||||
| @@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); | |||||
| double (*znrm2_k) (BLASLONG, double *, BLASLONG); | double (*znrm2_k) (BLASLONG, double *, BLASLONG); | ||||
| double (*zasum_k) (BLASLONG, double *, BLASLONG); | double (*zasum_k) (BLASLONG, double *, BLASLONG); | ||||
| double (*zsum_k) (BLASLONG, double *, BLASLONG); | |||||
| int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
| openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
| openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | ||||
| @@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||||
| xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); | xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); | ||||
| xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); | xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); | ||||
| xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG); | |||||
| int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
| openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
| openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | ||||
| @@ -19,6 +19,7 @@ | |||||
| #define QDOTC_K qdot_k | #define QDOTC_K qdot_k | ||||
| #define QNRM2_K qnrm2_k | #define QNRM2_K qnrm2_k | ||||
| #define QSCAL_K qscal_k | #define QSCAL_K qscal_k | ||||
| #define QSUM_K qsum_k | |||||
| #define QSWAP_K qswap_k | #define QSWAP_K qswap_k | ||||
| #define QROT_K qrot_k | #define QROT_K qrot_k | ||||
| @@ -161,6 +162,7 @@ | |||||
| #define QDOTC_K gotoblas -> qdot_k | #define QDOTC_K gotoblas -> qdot_k | ||||
| #define QNRM2_K gotoblas -> qnrm2_k | #define QNRM2_K gotoblas -> qnrm2_k | ||||
| #define QSCAL_K gotoblas -> qscal_k | #define QSCAL_K gotoblas -> qscal_k | ||||
| #define QSUM_K gotoblas -> qsum_k | |||||
| #define QSWAP_K gotoblas -> qswap_k | #define QSWAP_K gotoblas -> qswap_k | ||||
| #define QROT_K gotoblas -> qrot_k | #define QROT_K gotoblas -> qrot_k | ||||
| @@ -12,6 +12,7 @@ | |||||
| #define ISMAX_K ismax_k | #define ISMAX_K ismax_k | ||||
| #define ISMIN_K ismin_k | #define ISMIN_K ismin_k | ||||
| #define SASUM_K sasum_k | #define SASUM_K sasum_k | ||||
| #define SSUM_K ssum_k | |||||
| #define SAXPYU_K saxpy_k | #define SAXPYU_K saxpy_k | ||||
| #define SAXPYC_K saxpy_k | #define SAXPYC_K saxpy_k | ||||
| #define SCOPY_K scopy_k | #define SCOPY_K scopy_k | ||||
| @@ -170,6 +171,7 @@ | |||||
| #define ISMAX_K gotoblas -> ismax_k | #define ISMAX_K gotoblas -> ismax_k | ||||
| #define ISMIN_K gotoblas -> ismin_k | #define ISMIN_K gotoblas -> ismin_k | ||||
| #define SASUM_K gotoblas -> sasum_k | #define SASUM_K gotoblas -> sasum_k | ||||
| #define SSUM_K gotoblas -> ssum_k | |||||
| #define SAXPYU_K gotoblas -> saxpy_k | #define SAXPYU_K gotoblas -> saxpy_k | ||||
| #define SAXPYC_K gotoblas -> saxpy_k | #define SAXPYC_K gotoblas -> saxpy_k | ||||
| #define SCOPY_K gotoblas -> scopy_k | #define SCOPY_K gotoblas -> scopy_k | ||||
| @@ -19,6 +19,7 @@ | |||||
| #define XDOTC_K xdotc_k | #define XDOTC_K xdotc_k | ||||
| #define XNRM2_K xnrm2_k | #define XNRM2_K xnrm2_k | ||||
| #define XSCAL_K xscal_k | #define XSCAL_K xscal_k | ||||
| #define XSUM_K xsum_k | |||||
| #define XSWAP_K xswap_k | #define XSWAP_K xswap_k | ||||
| #define XROT_K xqrot_k | #define XROT_K xqrot_k | ||||
| @@ -227,6 +228,7 @@ | |||||
| #define XDOTC_K gotoblas -> xdotc_k | #define XDOTC_K gotoblas -> xdotc_k | ||||
| #define XNRM2_K gotoblas -> xnrm2_k | #define XNRM2_K gotoblas -> xnrm2_k | ||||
| #define XSCAL_K gotoblas -> xscal_k | #define XSCAL_K gotoblas -> xscal_k | ||||
| #define XSUM_K gotoblas -> xsum_k | |||||
| #define XSWAP_K gotoblas -> xswap_k | #define XSWAP_K gotoblas -> xswap_k | ||||
| #define XROT_K gotoblas -> xqrot_k | #define XROT_K gotoblas -> xqrot_k | ||||
| @@ -19,6 +19,7 @@ | |||||
| #define ZDOTC_K zdotc_k | #define ZDOTC_K zdotc_k | ||||
| #define ZNRM2_K znrm2_k | #define ZNRM2_K znrm2_k | ||||
| #define ZSCAL_K zscal_k | #define ZSCAL_K zscal_k | ||||
| #define ZSUM_K zsum_k | |||||
| #define ZSWAP_K zswap_k | #define ZSWAP_K zswap_k | ||||
| #define ZROT_K zdrot_k | #define ZROT_K zdrot_k | ||||
| @@ -249,6 +250,7 @@ | |||||
| #define ZDOTC_K gotoblas -> zdotc_k | #define ZDOTC_K gotoblas -> zdotc_k | ||||
| #define ZNRM2_K gotoblas -> znrm2_k | #define ZNRM2_K gotoblas -> znrm2_k | ||||
| #define ZSCAL_K gotoblas -> zscal_k | #define ZSCAL_K gotoblas -> zscal_k | ||||
| #define ZSUM_K gotoblas -> zsum_k | |||||
| #define ZSWAP_K gotoblas -> zswap_k | #define ZSWAP_K gotoblas -> zswap_k | ||||
| #define ZROT_K gotoblas -> zdrot_k | #define ZROT_K gotoblas -> zdrot_k | ||||
| @@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES | |||||
| rotm.c rotmg.c # N.B. these do not have complex counterparts | rotm.c rotmg.c # N.B. these do not have complex counterparts | ||||
| rot.c | rot.c | ||||
| asum.c | asum.c | ||||
| sum.c | |||||
| ) | ) | ||||
| # these will have 'z' prepended for the complex version | # these will have 'z' prepended for the complex version | ||||
| @@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||||
| GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX") | GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX") | ||||
| GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") | GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") | ||||
| GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX") | GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX") | ||||
| GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX") | |||||
| endif () | endif () | ||||
| if (${float_type} STREQUAL "ZCOMPLEX") | if (${float_type} STREQUAL "ZCOMPLEX") | ||||
| GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") | GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") | ||||
| @@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||||
| GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | ||||
| GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | ||||
| GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | ||||
| GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | |||||
| endif () | endif () | ||||
| endforeach () | endforeach () | ||||
| @@ -25,7 +25,7 @@ SBLAS1OBJS = \ | |||||
| saxpy.$(SUFFIX) sswap.$(SUFFIX) \ | saxpy.$(SUFFIX) sswap.$(SUFFIX) \ | ||||
| scopy.$(SUFFIX) sscal.$(SUFFIX) \ | scopy.$(SUFFIX) sscal.$(SUFFIX) \ | ||||
| sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ | sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ | ||||
| sasum.$(SUFFIX) snrm2.$(SUFFIX) \ | |||||
| sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \ | |||||
| smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ | smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ | ||||
| smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ | smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ | ||||
| srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ | srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ | ||||
| @@ -51,7 +51,7 @@ DBLAS1OBJS = \ | |||||
| daxpy.$(SUFFIX) dswap.$(SUFFIX) \ | daxpy.$(SUFFIX) dswap.$(SUFFIX) \ | ||||
| dcopy.$(SUFFIX) dscal.$(SUFFIX) \ | dcopy.$(SUFFIX) dscal.$(SUFFIX) \ | ||||
| ddot.$(SUFFIX) \ | ddot.$(SUFFIX) \ | ||||
| dasum.$(SUFFIX) dnrm2.$(SUFFIX) \ | |||||
| dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \ | |||||
| dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ | dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ | ||||
| dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ | dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ | ||||
| drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ | drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ | ||||
| @@ -76,7 +76,7 @@ CBLAS1OBJS = \ | |||||
| caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ | caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ | ||||
| ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ | ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ | ||||
| cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ | cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ | ||||
| scasum.$(SUFFIX) scnrm2.$(SUFFIX) \ | |||||
| scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \ | |||||
| scamax.$(SUFFIX) icamax.$(SUFFIX) \ | scamax.$(SUFFIX) icamax.$(SUFFIX) \ | ||||
| scamin.$(SUFFIX) icamin.$(SUFFIX) \ | scamin.$(SUFFIX) icamin.$(SUFFIX) \ | ||||
| csrot.$(SUFFIX) crotg.$(SUFFIX) \ | csrot.$(SUFFIX) crotg.$(SUFFIX) \ | ||||
| @@ -105,7 +105,7 @@ ZBLAS1OBJS = \ | |||||
| zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ | zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ | ||||
| zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ | zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ | ||||
| zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ | zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ | ||||
| dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \ | |||||
| dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \ | |||||
| dzamax.$(SUFFIX) izamax.$(SUFFIX) \ | dzamax.$(SUFFIX) izamax.$(SUFFIX) \ | ||||
| dzamin.$(SUFFIX) izamin.$(SUFFIX) \ | dzamin.$(SUFFIX) izamin.$(SUFFIX) \ | ||||
| zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ | zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ | ||||
| @@ -146,7 +146,7 @@ QBLAS1OBJS = \ | |||||
| qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ | qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ | ||||
| qcopy.$(SUFFIX) qscal.$(SUFFIX) \ | qcopy.$(SUFFIX) qscal.$(SUFFIX) \ | ||||
| qdot.$(SUFFIX) \ | qdot.$(SUFFIX) \ | ||||
| qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||||
| qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||||
| qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ | qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ | ||||
| qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ | qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ | ||||
| qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ | qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ | ||||
| @@ -168,7 +168,7 @@ XBLAS1OBJS = \ | |||||
| xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ | xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ | ||||
| xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ | xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ | ||||
| xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ | xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ | ||||
| qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||||
| qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||||
| qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ | qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ | ||||
| qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ | qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ | ||||
| xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ | xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ | ||||
| @@ -203,7 +203,7 @@ ifdef QUAD_PRECISION | |||||
| QBLAS1OBJS = \ | QBLAS1OBJS = \ | ||||
| qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ | qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ | ||||
| qcopy.$(SUFFIX) qscal.$(SUFFIX) \ | qcopy.$(SUFFIX) qscal.$(SUFFIX) \ | ||||
| qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||||
| qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||||
| qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ | qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ | ||||
| qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ | qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ | ||||
| qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ | qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ | ||||
| @@ -224,7 +224,7 @@ QBLAS3OBJS = \ | |||||
| XBLAS1OBJS = \ | XBLAS1OBJS = \ | ||||
| xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ | xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ | ||||
| xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ | xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ | ||||
| qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||||
| qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||||
| qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ | qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ | ||||
| qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ | qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ | ||||
| xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ | xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ | ||||
| @@ -264,7 +264,7 @@ CSBLAS1OBJS = \ | |||||
| cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | ||||
| cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | ||||
| cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ | cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ | ||||
| cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) | |||||
| cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) | |||||
| CSBLAS2OBJS = \ | CSBLAS2OBJS = \ | ||||
| cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ | cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ | ||||
| @@ -282,7 +282,7 @@ CDBLAS1OBJS = \ | |||||
| cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | ||||
| cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | ||||
| cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ | cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ | ||||
| cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) | |||||
| cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) | |||||
| CDBLAS2OBJS = \ | CDBLAS2OBJS = \ | ||||
| cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ | cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ | ||||
| @@ -303,7 +303,7 @@ CCBLAS1OBJS = \ | |||||
| cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | ||||
| cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | ||||
| cblas_caxpby.$(SUFFIX) \ | cblas_caxpby.$(SUFFIX) \ | ||||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) | |||||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) | |||||
| CCBLAS2OBJS = \ | CCBLAS2OBJS = \ | ||||
| cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ | cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ | ||||
| @@ -330,7 +330,7 @@ CZBLAS1OBJS = \ | |||||
| cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | ||||
| cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | ||||
| cblas_zaxpby.$(SUFFIX) \ | cblas_zaxpby.$(SUFFIX) \ | ||||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) | |||||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) | |||||
| CZBLAS2OBJS = \ | CZBLAS2OBJS = \ | ||||
| @@ -565,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c | |||||
| qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c | qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c | ||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | $(CC) $(CFLAGS) -c $< -o $(@F) | ||||
| ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c | |||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||||
| dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c | |||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||||
| qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c | |||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||||
| scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c | |||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||||
| dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c | |||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||||
| qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c | |||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||||
| snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c | snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c | ||||
| $(CC) $(CFLAGS) -c $< -o $(@F) | $(CC) $(CFLAGS) -c $< -o $(@F) | ||||
| @@ -1412,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c | |||||
| cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c | cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
| cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
| cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
| cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
| cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
| cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c | cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
| @@ -1419,7 +1449,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c | |||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
| cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c | cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||||
| cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c | cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c | ||||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | ||||
| @@ -0,0 +1,97 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| #ifdef FUNCTION_PROFILE | |||||
| #include "functable.h" | |||||
| #endif | |||||
| #ifndef CBLAS | |||||
/*
 * Non-CBLAS entry point of the ?sum extension: the element count and the
 * stride are received through pointers and copied into BLASLONG locals.
 * Returns 0 without touching x when n <= 0; otherwise returns the value
 * produced by SUM_K(n, x, incx), cast to FLOATRET.
 * NOTE(review): NAME, FLOATRET, SUM_K and the debug/profile macros are not
 * defined in this file (presumably they come from common.h); the statement
 * order around them is significant only insofar as their expansions require.
 */
| FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||||
| BLASLONG n = *N; | |||||
| BLASLONG incx = *INCX; | |||||
| FLOATRET ret; | |||||
| PRINT_DEBUG_NAME; | |||||
/* Empty or negative length: nothing to add up. */
| if (n <= 0) return 0; | |||||
| IDEBUG_START; | |||||
| FUNCTION_PROFILE_START(); | |||||
/* All arithmetic is delegated to the selected kernel. */
| ret = (FLOATRET)SUM_K(n, x, incx); | |||||
| FUNCTION_PROFILE_END(COMPSIZE, n, n); | |||||
| IDEBUG_END; | |||||
| return ret; | |||||
| } | |||||
| #else | |||||
/*
 * CBLAS entry point of the ?sum extension: n and incx are passed by value.
 * In COMPLEX builds the vector arrives as an untyped pointer and is
 * reinterpreted as FLOAT*; otherwise it is already a FLOAT*.
 * Returns 0 when n <= 0, else the value produced by SUM_K(n, x, incx).
 * NOTE(review): unlike the non-CBLAS variant, the result type here is FLOAT
 * and no cast is applied to the kernel result.
 */
| #ifdef COMPLEX | |||||
| FLOAT CNAME(blasint n, void *vx, blasint incx){ | |||||
| FLOAT *x = (FLOAT*) vx; | |||||
| #else | |||||
| FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||||
| #endif | |||||
| FLOAT ret; | |||||
| PRINT_DEBUG_CNAME; | |||||
/* Empty or negative length: nothing to add up. */
| if (n <= 0) return 0; | |||||
| IDEBUG_START; | |||||
| FUNCTION_PROFILE_START(); | |||||
| ret = SUM_K(n, x, incx); | |||||
| FUNCTION_PROFILE_END(COMPSIZE, n, n); | |||||
| IDEBUG_END; | |||||
| return ret; | |||||
| } | |||||
| #endif | |||||
| @@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type}) | |||||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type}) | ||||
| @@ -340,6 +340,32 @@ ifndef XSCALKERNEL | |||||
| XSCALKERNEL = zscal.S | XSCALKERNEL = zscal.S | ||||
| endif | endif | ||||
| ### SUM ### | |||||
# Fallback sources for the SUM kernels.  Each variable is assigned only when
# it is not already defined: the real variants (S, D, Q) fall back to sum.S,
# the complex variants (C, Z, X) fall back to zsum.S.
| ifndef SSUMKERNEL | |||||
| SSUMKERNEL = sum.S | |||||
| endif | |||||
| ifndef DSUMKERNEL | |||||
| DSUMKERNEL = sum.S | |||||
| endif | |||||
| ifndef CSUMKERNEL | |||||
| CSUMKERNEL = zsum.S | |||||
| endif | |||||
| ifndef ZSUMKERNEL | |||||
| ZSUMKERNEL = zsum.S | |||||
| endif | |||||
| ifndef QSUMKERNEL | |||||
| QSUMKERNEL = sum.S | |||||
| endif | |||||
| ifndef XSUMKERNEL | |||||
| XSUMKERNEL = zsum.S | |||||
| endif | |||||
| ### SWAP ### | ### SWAP ### | ||||
| ifndef SSWAPKERNEL | ifndef SSWAPKERNEL | ||||
| @@ -453,7 +479,7 @@ endif | |||||
| SBLASOBJS += \ | SBLASOBJS += \ | ||||
| samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \ | samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \ | isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | |||||
| sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | |||||
| sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ | sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ | ||||
| snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ | snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ | ||||
| saxpby_k$(TSUFFIX).$(SUFFIX) | saxpby_k$(TSUFFIX).$(SUFFIX) | ||||
| @@ -463,31 +489,32 @@ DBLASOBJS += \ | |||||
| idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ | idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ | dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ | ||||
| dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ | dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ | ||||
| daxpby_k$(TSUFFIX).$(SUFFIX) | |||||
| daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) | |||||
| QBLASOBJS += \ | QBLASOBJS += \ | ||||
| qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ | qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ | iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ | qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ | ||||
| qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) | |||||
| qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ | |||||
| qsum_k$(TSUFFIX).$(SUFFIX) | |||||
| CBLASOBJS += \ | CBLASOBJS += \ | ||||
| camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ | camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \ | casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \ | ||||
| cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \ | cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \ | ||||
| cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) | |||||
| cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX) | |||||
| ZBLASOBJS += \ | ZBLASOBJS += \ | ||||
| zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \ | zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \ | zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \ | ||||
| zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \ | zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \ | ||||
| zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) | |||||
| zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX) | |||||
| XBLASOBJS += \ | XBLASOBJS += \ | ||||
| xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \ | xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \ | ||||
| xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \ | xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \ | ||||
| xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ | xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ | ||||
| xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) | |||||
| xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) | |||||
| ### AMAX ### | ### AMAX ### | ||||
| @@ -617,7 +644,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KE | |||||
| $(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL) | $(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ | $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ | ||||
| ### ASUM ### | |||||
| $(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) | $(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | ||||
| @@ -636,6 +663,26 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE | |||||
| $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) | $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) | ||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | ||||
| ### SUM ### | |||||
# Build rules for the six ?sum_k kernel objects (s, d, q, c, z, x).  Every
# rule compiles the source named by the matching ?SUMKERNEL variable with the
# same recipe; only the COMPLEX / DOUBLE / XDOUBLE define-undefine flags vary:
#   s: -UCOMPLEX -UDOUBLE    d: -UCOMPLEX -DDOUBLE    q: -UCOMPLEX -DXDOUBLE
#   c: -DCOMPLEX -UDOUBLE    z: -DCOMPLEX -DDOUBLE    x: -DCOMPLEX -DXDOUBLE
| $(KDIR)ssum_k$(TSUFFIX).$(SUFFIX) $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSUMKERNEL) | |||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | |||||
| $(KDIR)dsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSUMKERNEL) | |||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ | |||||
| $(KDIR)qsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSUMKERNEL) | |||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ | |||||
| $(KDIR)csum_k$(TSUFFIX).$(SUFFIX) $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSUMKERNEL) | |||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ | |||||
| $(KDIR)zsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSUMKERNEL) | |||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ | |||||
| $(KDIR)xsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSUMKERNEL) | |||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | |||||
| ### AXPY ### | |||||
| $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) | $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | ||||
| @@ -0,0 +1,206 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #include "version.h" | |||||
/*
 * Strided sum of N real elements (no absolute value is taken).
 * Register roles, as used below:
 *   N, X, INCX  - arguments in $16, $17, $18;  I - loop counter ($19)
 *   s0..s3      - four partial sums (the final result is left in s0 = $f0)
 *   a0..a7      - values just loaded from X
 *   t0..t3      - values staged for the next ADD (one step behind the loads)
 * NOTE(review): LD, ADD, SXADDQ and SIZE are macros defined elsewhere
 * (presumably common.h); SXADDQ INCX, X, X is assumed to advance X by INCX
 * elements -- confirm against the macro definition.
 */
| #define PREFETCHSIZE 88 | |||||
| #define N $16 | |||||
| #define X $17 | |||||
| #define INCX $18 | |||||
| #define I $19 | |||||
| #define s0 $f0 | |||||
| #define s1 $f1 | |||||
| #define s2 $f10 | |||||
| #define s3 $f11 | |||||
| #define a0 $f12 | |||||
| #define a1 $f13 | |||||
| #define a2 $f14 | |||||
| #define a3 $f15 | |||||
| #define a4 $f16 | |||||
| #define a5 $f17 | |||||
| #define a6 $f18 | |||||
| #define a7 $f19 | |||||
| #define t0 $f20 | |||||
| #define t1 $f21 | |||||
| #define t2 $f22 | |||||
| #define t3 $f23 | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
/* Clear s0/t0 first so the N <= 0 exit through $L999 yields 0. */
| fclr s0 | |||||
| unop | |||||
| fclr t0 | |||||
| ble N, $L999 | |||||
/* I = N >> 3: number of 8-element groups; skip the unrolled part if none. */
| sra N, 3, I | |||||
| fclr s1 | |||||
| fclr s2 | |||||
| ble I, $L15 | |||||
/* Preload the first six values of the group and clear t1..t3 and s3. */
| LD a0, 0 * SIZE(X) | |||||
| fclr t1 | |||||
| SXADDQ INCX, X, X | |||||
| fclr t2 | |||||
| LD a1, 0 * SIZE(X) | |||||
| fclr t3 | |||||
| SXADDQ INCX, X, X | |||||
| fclr s3 | |||||
| LD a2, 0 * SIZE(X) | |||||
| SXADDQ INCX, X, X | |||||
| LD a3, 0 * SIZE(X) | |||||
| SXADDQ INCX, X, X | |||||
| LD a4, 0 * SIZE(X) | |||||
| SXADDQ INCX, X, X | |||||
| LD a5, 0 * SIZE(X) | |||||
| SXADDQ INCX, X, X | |||||
| lda I, -1(I) | |||||
| ble I, $L13 | |||||
| .align 4 | |||||
/*
 * Main loop: each pass performs eight ADDs of previously staged values into
 * s0..s3, stages the next values with fmov, and issues eight loads (a6, a7
 * for the current group, a0..a5 for the next one).
 */
| $L12: | |||||
| ADD s0, t0, s0 | |||||
/* Load whose destination is $31, i.e. the value is discarded; presumably a prefetch. */
| ldl $31, PREFETCHSIZE * 2 * SIZE(X) | |||||
| fmov a0, t0 | |||||
| lda I, -1(I) | |||||
| ADD s1, t1, s1 | |||||
| LD a6, 0 * SIZE(X) | |||||
| fmov a1, t1 | |||||
| SXADDQ INCX, X, X | |||||
| ADD s2, t2, s2 | |||||
| LD a7, 0 * SIZE(X) | |||||
| fmov a2, t2 | |||||
| SXADDQ INCX, X, X | |||||
| ADD s3, t3, s3 | |||||
| LD a0, 0 * SIZE(X) | |||||
| fmov a3, t3 | |||||
| SXADDQ INCX, X, X | |||||
| ADD s0, t0, s0 | |||||
| LD a1, 0 * SIZE(X) | |||||
| fmov a4, t0 | |||||
| SXADDQ INCX, X, X | |||||
| ADD s1, t1, s1 | |||||
| LD a2, 0 * SIZE(X) | |||||
| fmov a5, t1 | |||||
| SXADDQ INCX, X, X | |||||
| ADD s2, t2, s2 | |||||
| LD a3, 0 * SIZE(X) | |||||
| fmov a6, t2 | |||||
| SXADDQ INCX, X, X | |||||
| ADD s3, t3, s3 | |||||
| LD a4, 0 * SIZE(X) | |||||
| fmov a7, t3 | |||||
| SXADDQ INCX, X, X | |||||
| LD a5, 0 * SIZE(X) | |||||
| unop | |||||
| SXADDQ INCX, X, X | |||||
| bne I, $L12 | |||||
| .align 4 | |||||
/*
 * Drain of the last group: load its remaining two values (a6, a7), add the
 * staged values, then fold s1 into s0 and s3 into s2.  The value staged in
 * t0 (from a4) is not added here; it is picked up at $L17 or $L999.
 */
| $L13: | |||||
| ADD s0, t0, s0 | |||||
| LD a6, 0 * SIZE(X) | |||||
| fmov a0, t0 | |||||
| SXADDQ INCX, X, X | |||||
| ADD s1, t1, s1 | |||||
| LD a7, 0 * SIZE(X) | |||||
| fmov a1, t1 | |||||
| SXADDQ INCX, X, X | |||||
| ADD s2, t2, s2 | |||||
| fmov a2, t2 | |||||
| ADD s3, t3, s3 | |||||
| fmov a3, t3 | |||||
| ADD s0, t0, s0 | |||||
| fmov a4, t0 | |||||
| ADD s1, t1, s1 | |||||
| fmov a5, t1 | |||||
| ADD s2, t2, s2 | |||||
| fmov a6, t2 | |||||
| ADD s3, t3, s3 | |||||
| fmov a7, t3 | |||||
| ADD s1, t1, s1 | |||||
| ADD s2, t2, s2 | |||||
| ADD s3, t3, s3 | |||||
| ADD s0, s1, s0 | |||||
| ADD s2, s3, s2 | |||||
| .align 4 | |||||
/* Remainder: I = N & 7 leftover elements; s2 is folded into s0 first. */
| $L15: | |||||
| and N, 7, I | |||||
| ADD s0, s2, s0 | |||||
| unop | |||||
| ble I, $L999 | |||||
| .align 4 | |||||
/* One element per pass: add the pending t0, then stage the new value in t0. */
| $L17: | |||||
| ADD s0, t0, s0 | |||||
| LD a0, 0 * SIZE(X) | |||||
| SXADDQ INCX, X, X | |||||
| fmov a0, t0 | |||||
| lda I, -1(I) | |||||
| bne I, $L17 | |||||
| .align 4 | |||||
/* Add the last staged value and return with the total in s0. */
| $L999: | |||||
| ADD s0, t0, s0 | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,208 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #include "version.h" | |||||
/*
 * Strided sum over N two-scalar elements: for every element the scalars at
 * offsets 0 * SIZE and 1 * SIZE (presumably the real and imaginary parts)
 * are both added, and a single scalar total is left in s0 = $f0.
 * Register roles, as used below:
 *   N, X, INCX  - arguments in $16, $17, $18;  I - loop counter ($19)
 *   s0, s2      - partial sums of the offset-0 scalars
 *   s1, s3      - partial sums of the offset-1 scalars
 *   a0..a7      - values just loaded;  t0..t3 - values staged for the next ADD
 * NOTE(review): LD, ADD, SXADDQ and SIZE are macros defined elsewhere
 * (presumably common.h) -- confirm their exact semantics there.
 */
| #define PREFETCHSIZE 88 | |||||
| #define N $16 | |||||
| #define X $17 | |||||
| #define INCX $18 | |||||
| #define I $19 | |||||
| #define s0 $f0 | |||||
| #define s1 $f1 | |||||
| #define s2 $f10 | |||||
| #define s3 $f11 | |||||
| #define a0 $f12 | |||||
| #define a1 $f13 | |||||
| #define a2 $f14 | |||||
| #define a3 $f15 | |||||
| #define a4 $f16 | |||||
| #define a5 $f17 | |||||
| #define a6 $f18 | |||||
| #define a7 $f19 | |||||
| #define t0 $f20 | |||||
| #define t1 $f21 | |||||
| #define t2 $f22 | |||||
| #define t3 $f23 | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
/* s0, s1, t0, t1 are cleared before the N <= 0 exit so $L999 yields 0. */
| fclr s0 | |||||
| unop | |||||
| fclr t0 | |||||
/* Double INCX: each element occupies two scalars. */
| addq INCX, INCX, INCX | |||||
| fclr s1 | |||||
| unop | |||||
| fclr t1 | |||||
| ble N, $L999 | |||||
/* I = N >> 2: number of 4-element groups (eight scalars each). */
| fclr s2 | |||||
| sra N, 2, I | |||||
| fclr s3 | |||||
| ble I, $L15 | |||||
/* Preload the first three elements of the group (six scalars). */
| LD a0, 0 * SIZE(X) | |||||
| fclr t2 | |||||
| LD a1, 1 * SIZE(X) | |||||
| SXADDQ INCX, X, X | |||||
| LD a2, 0 * SIZE(X) | |||||
| fclr t3 | |||||
| LD a3, 1 * SIZE(X) | |||||
| SXADDQ INCX, X, X | |||||
| LD a4, 0 * SIZE(X) | |||||
| LD a5, 1 * SIZE(X) | |||||
| SXADDQ INCX, X, X | |||||
| lda I, -1(I) | |||||
| ble I, $L13 | |||||
| .align 4 | |||||
/*
 * Main loop: each pass adds eight staged scalars into s0..s3, restages
 * t0..t3 with fmov, and loads the fourth element of the current group plus
 * the first three elements of the next one.
 */
| $L12: | |||||
| ADD s0, t0, s0 | |||||
/* Load whose destination is $31, i.e. the value is discarded; presumably a prefetch. */
| ldl $31, PREFETCHSIZE * SIZE(X) | |||||
| fmov a0, t0 | |||||
| lda I, -1(I) | |||||
| ADD s1, t1, s1 | |||||
| LD a6, 0 * SIZE(X) | |||||
| fmov a1, t1 | |||||
| unop | |||||
| ADD s2, t2, s2 | |||||
| LD a7, 1 * SIZE(X) | |||||
| fmov a2, t2 | |||||
| SXADDQ INCX, X, X | |||||
| ADD s3, t3, s3 | |||||
| LD a0, 0 * SIZE(X) | |||||
| fmov a3, t3 | |||||
| unop | |||||
| ADD s0, t0, s0 | |||||
| LD a1, 1 * SIZE(X) | |||||
| fmov a4, t0 | |||||
| SXADDQ INCX, X, X | |||||
| ADD s1, t1, s1 | |||||
| LD a2, 0 * SIZE(X) | |||||
| fmov a5, t1 | |||||
| unop | |||||
| ADD s2, t2, s2 | |||||
| LD a3, 1 * SIZE(X) | |||||
| fmov a6, t2 | |||||
| SXADDQ INCX, X, X | |||||
| ADD s3, t3, s3 | |||||
| LD a4, 0 * SIZE(X) | |||||
| fmov a7, t3 | |||||
| unop | |||||
| LD a5, 1 * SIZE(X) | |||||
| unop | |||||
| SXADDQ INCX, X, X | |||||
| bne I, $L12 | |||||
| .align 4 | |||||
/*
 * Drain of the last group: load its fourth element (a6, a7) and add the
 * staged values.  The values staged in t0 and t1 (from a4, a5) are not
 * added here; they are picked up at $L17 or $L999.
 */
| $L13: | |||||
| ADD s0, t0, s0 | |||||
| LD a6, 0 * SIZE(X) | |||||
| fmov a0, t0 | |||||
| ADD s1, t1, s1 | |||||
| LD a7, 1 * SIZE(X) | |||||
| fmov a1, t1 | |||||
| SXADDQ INCX, X, X | |||||
| ADD s2, t2, s2 | |||||
| fmov a2, t2 | |||||
| ADD s3, t3, s3 | |||||
| fmov a3, t3 | |||||
| ADD s0, t0, s0 | |||||
| fmov a4, t0 | |||||
| ADD s1, t1, s1 | |||||
| fmov a5, t1 | |||||
| ADD s2, t2, s2 | |||||
| fmov a6, t2 | |||||
| ADD s3, t3, s3 | |||||
| fmov a7, t3 | |||||
| ADD s2, t2, s2 | |||||
| ADD s3, t3, s3 | |||||
| .align 4 | |||||
/* Fold s2 into s0 and s3 into s1; I = N & 3 leftover elements. */
| $L15: | |||||
| ADD s0, s2, s0 | |||||
| and N, 3, I | |||||
| ADD s1, s3, s1 | |||||
| ble I, $L999 | |||||
| .align 4 | |||||
/* One element per pass: add pending t0/t1, then stage the new pair. */
| $L17: | |||||
| ADD s0, t0, s0 | |||||
| LD a0, 0 * SIZE(X) | |||||
| fmov a0, t0 | |||||
| lda I, -1(I) | |||||
| ADD s1, t1, s1 | |||||
| LD a1, 1 * SIZE(X) | |||||
| fmov a1, t1 | |||||
| SXADDQ INCX, X, X | |||||
| bne I, $L17 | |||||
| .align 4 | |||||
/* Add the last staged pair, combine both partial sums into s0, return. */
| $L999: | |||||
| ADD s0, t0, s0 | |||||
| ADD s1, t1, s1 | |||||
| ADD s0, s1, s0 | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c | |||||
| CASUMKERNEL = ../arm/zasum.c | CASUMKERNEL = ../arm/zasum.c | ||||
| ZASUMKERNEL = ../arm/zasum.c | ZASUMKERNEL = ../arm/zasum.c | ||||
# SUM kernels for this target: the real precisions (S, D) share ../arm/sum.c,
# the complex precisions (C, Z) share ../arm/zsum.c.
| SSUMKERNEL = ../arm/sum.c | |||||
| DSUMKERNEL = ../arm/sum.c | |||||
| CSUMKERNEL = ../arm/zsum.c | |||||
| ZSUMKERNEL = ../arm/zsum.c | |||||
| SAXPYKERNEL = ../arm/axpy.c | SAXPYKERNEL = ../arm/axpy.c | ||||
| DAXPYKERNEL = ../arm/axpy.c | DAXPYKERNEL = ../arm/axpy.c | ||||
| CAXPYKERNEL = ../arm/zaxpy.c | CAXPYKERNEL = ../arm/zaxpy.c | ||||
| @@ -37,6 +37,9 @@ DASUMKERNEL = asum_vfp.S | |||||
| CASUMKERNEL = asum_vfp.S | CASUMKERNEL = asum_vfp.S | ||||
| ZASUMKERNEL = asum_vfp.S | ZASUMKERNEL = asum_vfp.S | ||||
# sum_vfp.S contains both real and COMPLEX code paths, so select it for all
# four precisions, mirroring the ?ASUMKERNEL entries above.  Without explicit
# C/Z entries the complex kernels presumably fall back to the generic zsum.S
# default instead of the VFP implementation.
| SSUMKERNEL = sum_vfp.S | |||||
| DSUMKERNEL = sum_vfp.S | |||||
| CSUMKERNEL = sum_vfp.S | |||||
| ZSUMKERNEL = sum_vfp.S | |||||
| SAXPYKERNEL = axpy_vfp.S | SAXPYKERNEL = axpy_vfp.S | ||||
| DAXPYKERNEL = axpy_vfp.S | DAXPYKERNEL = axpy_vfp.S | ||||
| CAXPYKERNEL = axpy_vfp.S | CAXPYKERNEL = axpy_vfp.S | ||||
| @@ -0,0 +1,51 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * trivial copy of asum.c with the ABS() removed * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
/*
 * Plain strided sum: adds x[0], x[inc_x], ..., x[(n - 1) * inc_x] in that
 * order and returns the total.  No absolute value is taken.
 * Returns 0 when n or inc_x is not positive.
 */
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| FLOAT total = 0.0; | |||||
| BLASLONG k; | |||||
| if (n < 1 || inc_x < 1) | |||||
| return total; | |||||
/* Visit the n selected elements front to back, same order as before. */
| for (k = 0; k < n; k++) | |||||
| total += x[k * inc_x]; | |||||
| return total; | |||||
| } | |||||
| @@ -0,0 +1,425 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
/*
 * Aliases used by the KERNEL_* macros below: N, X and INC_X name the
 * registers r0, r1 and r2; I names r12; X_PRE is the offset passed to the
 * pld instructions.  STACKSIZE is not referenced in the macros shown here.
 */
| #define STACKSIZE 256 | |||||
| #define N r0 | |||||
| #define X r1 | |||||
| #define INC_X r2 | |||||
| #define I r12 | |||||
| #define X_PRE 512 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| #if !defined(COMPLEX) | |||||
/*
 * Real-valued accumulation macros, double precision first, single precision
 * after the #else.  Naming as visible from the bodies:
 *   KERNEL_F4 / KERNEL_F1 - load with post-increment of X (X!), 4 or 1 value
 *   KERNEL_S4 / KERNEL_S1 - load at X, then step X by INC_X, 4 or 1 value
 * F4 spreads its additions over two accumulators (d0/d1 or s0/s1); the other
 * macros add into the first accumulator only.
 * NOTE(review): INC_X is presumably pre-scaled to bytes by code outside
 * these macros -- confirm in the routine body.
 */
| #if defined(DOUBLE) | |||||
/* Four contiguous doubles: d4, d6 go to d0 and d5, d7 go to d1. */
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| vldmia.f64 X!, { d4 - d5 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vldmia.f64 X!, { d6 - d7 } | |||||
| vadd.f64 d1 , d1, d5 | |||||
| vadd.f64 d0 , d0, d6 | |||||
| vadd.f64 d1 , d1, d7 | |||||
| .endm | |||||
/* One contiguous double added to d0. */
| .macro KERNEL_F1 | |||||
| vldmia.f64 X!, { d4 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| .endm | |||||
/* Four strided doubles, each added to d0. */
| .macro KERNEL_S4 | |||||
| vldmia.f64 X, { d4 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| add X, X, INC_X | |||||
| vldmia.f64 X, { d4 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| add X, X, INC_X | |||||
| vldmia.f64 X, { d4 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| add X, X, INC_X | |||||
| vldmia.f64 X, { d4 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
/* One strided double added to d0. */
| .macro KERNEL_S1 | |||||
| vldmia.f64 X, { d4 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #else | |||||
/* Four contiguous floats: s4, s6 go to s0 and s5, s7 go to s1 (no pld here). */
| .macro KERNEL_F4 | |||||
| vldmia.f32 X!, { s4 - s5 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vldmia.f32 X!, { s6 - s7 } | |||||
| vadd.f32 s1 , s1, s5 | |||||
| vadd.f32 s0 , s0, s6 | |||||
| vadd.f32 s1 , s1, s7 | |||||
| .endm | |||||
/* One contiguous float added to s0. */
| .macro KERNEL_F1 | |||||
| vldmia.f32 X!, { s4 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| .endm | |||||
/* Four strided floats, each added to s0. */
| .macro KERNEL_S4 | |||||
| vldmia.f32 X, { s4 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| add X, X, INC_X | |||||
| vldmia.f32 X, { s4 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| add X, X, INC_X | |||||
| vldmia.f32 X, { s4 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| add X, X, INC_X | |||||
| vldmia.f32 X, { s4 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
/* One strided float added to s0. */
| .macro KERNEL_S1 | |||||
| vldmia.f32 X, { s4 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
/*
 * COMPLEX + DOUBLE accumulation macros.  Every element contributes two
 * doubles, and both are added into the running sums (no separation of the
 * two components is kept).
 */
/* Four contiguous elements = eight doubles, alternating between d0 and d1. */
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| vldmia.f64 X!, { d4 - d5 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vldmia.f64 X!, { d6 - d7 } | |||||
| vadd.f64 d1 , d1, d5 | |||||
| vadd.f64 d0 , d0, d6 | |||||
| vadd.f64 d1 , d1, d7 | |||||
| pld [ X, #X_PRE ] | |||||
| vldmia.f64 X!, { d4 - d5 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vldmia.f64 X!, { d6 - d7 } | |||||
| vadd.f64 d1 , d1, d5 | |||||
| vadd.f64 d0 , d0, d6 | |||||
| vadd.f64 d1 , d1, d7 | |||||
| .endm | |||||
/* One contiguous element: two single-register loads, both added to d0. */
| .macro KERNEL_F1 | |||||
| vldmia.f64 X!, { d4 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vldmia.f64 X!, { d4 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| .endm | |||||
/* Four strided elements: each pair d4/d5 is added to d0, then X += INC_X. */
| .macro KERNEL_S4 | |||||
| vldmia.f64 X, { d4 -d5 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vadd.f64 d0 , d0, d5 | |||||
| add X, X, INC_X | |||||
| vldmia.f64 X, { d4 -d5 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vadd.f64 d0 , d0, d5 | |||||
| add X, X, INC_X | |||||
| vldmia.f64 X, { d4 -d5 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vadd.f64 d0 , d0, d5 | |||||
| add X, X, INC_X | |||||
| vldmia.f64 X, { d4 -d5 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vadd.f64 d0 , d0, d5 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
/* One strided element: pair d4/d5 added to d0, then X += INC_X. */
| .macro KERNEL_S1 | |||||
| vldmia.f64 X, { d4 -d5 } | |||||
| vadd.f64 d0 , d0, d4 | |||||
| vadd.f64 d0 , d0, d5 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #else | |||||
| .macro KERNEL_F4 | |||||
| pld [ X, #X_PRE ] | |||||
| vldmia.f32 X!, { s4 - s5 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vldmia.f32 X!, { s6 - s7 } | |||||
| vadd.f32 s1 , s1, s5 | |||||
| vadd.f32 s0 , s0, s6 | |||||
| vadd.f32 s1 , s1, s7 | |||||
| vldmia.f32 X!, { s4 - s5 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vldmia.f32 X!, { s6 - s7 } | |||||
| vadd.f32 s1 , s1, s5 | |||||
| vadd.f32 s0 , s0, s6 | |||||
| vadd.f32 s1 , s1, s7 | |||||
| .endm | |||||
| .macro KERNEL_F1 | |||||
| vldmia.f32 X!, { s4 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vldmia.f32 X!, { s4 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| .endm | |||||
| .macro KERNEL_S4 | |||||
| vldmia.f32 X, { s4 -s5 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vadd.f32 s0 , s0, s5 | |||||
| add X, X, INC_X | |||||
| vldmia.f32 X, { s4 -s5 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vadd.f32 s0 , s0, s5 | |||||
| add X, X, INC_X | |||||
| vldmia.f32 X, { s4 -s5 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vadd.f32 s0 , s0, s5 | |||||
| add X, X, INC_X | |||||
| vldmia.f32 X, { s4 -s5 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vadd.f32 s0 , s0, s5 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| .macro KERNEL_S1 | |||||
| vldmia.f32 X, { s4 -s5 } | |||||
| vadd.f32 s0 , s0, s4 | |||||
| vadd.f32 s0 , s0, s5 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #endif | |||||
| #endif | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| * | |||||
| * Entry point: adds up the N elements of X (the visible KERNEL_* macros only add, | |||||
| * no absolute value is taken) and returns the total in s0, or d0 when DOUBLE is | |||||
| * defined. | |||||
| * | |||||
| * - Two accumulators (s0 and s1, or d0 and d1) are zeroed first and are combined | |||||
| *   only at asum_kernel_L999. | |||||
| * - N <= 0 or INC_X == 0 branches straight to asum_kernel_L999, so zero is returned. | |||||
| *   A negative INC_X is not rejected here; it takes the strided path. | |||||
| * - INC_X == 1: KERNEL_F4 runs N >> 2 times (two expansions per loop pass with an | |||||
| *   exit test in between), then KERNEL_F1 runs for the remaining N & 3 elements. | |||||
| * - Any other INC_X: the stride is scaled to bytes, KERNEL_S4 runs N >> 2 times and | |||||
| *   KERNEL_S1 runs for the remaining N & 3 elements. | |||||
| * - When __ARM_PCS_VFP is not defined the result is additionally copied to r0 | |||||
| *   (r0 and r1 for DOUBLE) before returning. | |||||
| **************************************************************************************/ | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| movs r12, #0 // clear floating point register | |||||
| vmov s0, r12 | |||||
| vmov s1, r12 | |||||
| #if defined(DOUBLE) | |||||
| vcvt.f64.f32 d0, s0 | |||||
| vcvt.f64.f32 d1, s1 | |||||
| #endif | |||||
| cmp N, #0 | |||||
| ble asum_kernel_L999 | |||||
| cmp INC_X, #0 | |||||
| beq asum_kernel_L999 | |||||
| cmp INC_X, #1 | |||||
| bne asum_kernel_S_BEGIN | |||||
| asum_kernel_F_BEGIN: | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble asum_kernel_F1 | |||||
| .align 5 | |||||
| asum_kernel_F4: | |||||
| #if !defined(DOUBLE) && !defined(COMPLEX) | |||||
| pld [ X, #X_PRE ] | |||||
| #endif | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| ble asum_kernel_F1 | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_F4 | |||||
| asum_kernel_F1: | |||||
| ands I, N, #3 | |||||
| ble asum_kernel_L999 | |||||
| asum_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_F10 | |||||
| b asum_kernel_L999 | |||||
| asum_kernel_S_BEGIN: | |||||
| #if defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| #else | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| #endif | |||||
| #endif | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble asum_kernel_S1 | |||||
| .align 5 | |||||
| asum_kernel_S4: | |||||
| KERNEL_S4 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_S4 | |||||
| asum_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble asum_kernel_L999 | |||||
| asum_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_S10 | |||||
| asum_kernel_L999: | |||||
| #if defined(DOUBLE) | |||||
| vadd.f64 d0 , d0, d1 // set return value | |||||
| #else | |||||
| vadd.f32 s0 , s0, s1 // set return value | |||||
| #endif | |||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #if !defined(DOUBLE) | |||||
| vmov r0, s0 | |||||
| #else | |||||
| vmov r0, r1, d0 | |||||
| #endif | |||||
| #endif | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,57 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * trivial copy of zasum.c with the ABS() removed * | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| /* Real part plus imaginary part of the complex element stored at x[i], x[i+1]. | |||||
| * The arguments and the whole expansion are parenthesized so the macro keeps | |||||
| * its meaning inside any surrounding expression. */ | |||||
| #define CSUM1(x,i) ((x)[(i)] + (x)[(i) + 1]) | |||||
| /* | |||||
| * Plain sum (no absolute values) of the real and imaginary parts of the | |||||
| * elements of x. | |||||
| * | |||||
| * n      number of elements to add; each element occupies two FLOATs | |||||
| * x      element storage, read as x[i] and x[i+1] per element | |||||
| * inc_x  distance between consecutive elements, counted in elements | |||||
| * | |||||
| * Returns 0 when n <= 0 or inc_x <= 0, otherwise the accumulated sum. | |||||
| */ | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| FLOAT sumf = 0.0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||||
| inc_x2 = 2 * inc_x; /* stride expressed in FLOAT units */ | |||||
| n *= inc_x2; /* n becomes the exclusive end index in FLOAT units */ | |||||
| while(i < n) | |||||
| { | |||||
| sumf += CSUM1(x,i); | |||||
| i += inc_x2; | |||||
| } | |||||
| return(sumf); | |||||
| } | |||||
| @@ -0,0 +1,164 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2019, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| * | |||||
| * REG0  - zero register used to clear the accumulator | |||||
| * SUMF  - scalar accumulator that carries the running and final sum (s0) | |||||
| * TMPF  - scalar scratch register (s1) | |||||
| * TMPVF - lane 0 of v1 written as a vector element list | |||||
| * SZ    - size constant 4 | |||||
| * TMPVF and SZ are not referenced by the macros or the routine that follow | |||||
| * in this file. | |||||
| *******************************************************************************/ | |||||
| #define REG0 wzr | |||||
| #define SUMF s0 | |||||
| #define TMPF s1 | |||||
| #define TMPVF {v1.s}[0] | |||||
| #define SZ 4 | |||||
| /****************************************************************************** | |||||
| * KERNEL_F1          - loads two floats (one complex element, 8 bytes) into v1 | |||||
| *                      and advances X by 8; ext places the second float in s2; | |||||
| *                      TMPF = first + second; SUMF += TMPF. | |||||
| * KERNEL_F8          - loads 64 bytes (16 floats, i.e. 8 complex elements) into | |||||
| *                      v1-v4, advances X by 64, prefetches 1024 bytes ahead and | |||||
| *                      accumulates lane-wise into v0.4s. | |||||
| * KERNEL_F8_FINALIZE - folds the four lanes of v0 into the scalar SUMF. | |||||
| * INIT_S             - converts INC_X from elements to bytes (shift left by 3). | |||||
| * KERNEL_S1          - same arithmetic as KERNEL_F1, but X advances by INC_X. | |||||
| ******************************************************************************/ | |||||
| .macro KERNEL_F1 | |||||
| ld1 {v1.2s}, [X], #8 | |||||
| ext v2.8b, v1.8b, v1.8b, #4 | |||||
| fadd TMPF, TMPF, s2 | |||||
| fadd SUMF, SUMF, TMPF | |||||
| .endm | |||||
| .macro KERNEL_F8 | |||||
| ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X] | |||||
| add X, X, #64 | |||||
| PRFM PLDL1KEEP, [X, #1024] | |||||
| fadd v1.4s, v1.4s, v2.4s | |||||
| fadd v3.4s, v3.4s, v4.4s | |||||
| fadd v0.4s, v0.4s, v1.4s | |||||
| fadd v0.4s, v0.4s, v3.4s | |||||
| .endm | |||||
| .macro KERNEL_F8_FINALIZE | |||||
| ext v1.16b, v0.16b, v0.16b, #8 | |||||
| fadd v0.2s, v0.2s, v1.2s | |||||
| faddp SUMF, v0.2s | |||||
| .endm | |||||
| .macro INIT_S | |||||
| lsl INC_X, INC_X, #3 | |||||
| .endm | |||||
| .macro KERNEL_S1 | |||||
| ld1 {v1.2s}, [X], INC_X | |||||
| ext v2.8b, v1.8b, v1.8b, #4 | |||||
| fadd TMPF, TMPF, s2 | |||||
| fadd SUMF, SUMF, TMPF | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| * | |||||
| * Entry point. The result is left in SUMF (s0). | |||||
| * - SUMF and s1 are cleared first; N <= 0 or INC_X <= 0 returns that zero. | |||||
| * - INC_X == 1: N >> 3 passes of KERNEL_F8, one KERNEL_F8_FINALIZE, then | |||||
| *   KERNEL_F1 once for each of the remaining N & 7 elements. | |||||
| * - Any other stride: INIT_S, then N >> 2 passes of four KERNEL_S1, then | |||||
| *   KERNEL_S1 once for each of the remaining N & 3 elements. | |||||
| *******************************************************************************/ | |||||
| PROLOGUE | |||||
| fmov SUMF, REG0 | |||||
| fmov s1, SUMF | |||||
| cmp N, xzr | |||||
| ble .Lcsum_kernel_L999 | |||||
| cmp INC_X, xzr | |||||
| ble .Lcsum_kernel_L999 | |||||
| cmp INC_X, #1 | |||||
| bne .Lcsum_kernel_S_BEGIN | |||||
| .Lcsum_kernel_F_BEGIN: | |||||
| asr I, N, #3 | |||||
| cmp I, xzr | |||||
| beq .Lcsum_kernel_F1 | |||||
| .Lcsum_kernel_F8: | |||||
| KERNEL_F8 | |||||
| subs I, I, #1 | |||||
| bne .Lcsum_kernel_F8 | |||||
| KERNEL_F8_FINALIZE | |||||
| .Lcsum_kernel_F1: | |||||
| ands I, N, #7 | |||||
| ble .Lcsum_kernel_L999 | |||||
| .Lcsum_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne .Lcsum_kernel_F10 | |||||
| .Lcsum_kernel_L999: | |||||
| ret | |||||
| .Lcsum_kernel_S_BEGIN: | |||||
| INIT_S | |||||
| asr I, N, #2 | |||||
| cmp I, xzr | |||||
| ble .Lcsum_kernel_S1 | |||||
| .Lcsum_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne .Lcsum_kernel_S4 | |||||
| .Lcsum_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble .Lcsum_kernel_L999 | |||||
| .Lcsum_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne .Lcsum_kernel_S10 | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,186 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2019, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| * | |||||
| * The aliases below pick 32-bit registers and a 4-byte element size unless | |||||
| * DOUBLE is defined, in which case the 64-bit forms and an 8-byte size are used. | |||||
| * REG0  - zero register used to clear the accumulator | |||||
| * SUMF  - scalar accumulator that carries the running and final sum | |||||
| * TMPF  - scalar scratch register | |||||
| * TMPVF - lane 0 of v1 as a vector element list (load target in KERNEL_S1) | |||||
| * SZ    - element size in bytes (post-increment amount in KERNEL_F1) | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) | |||||
| #define REG0 wzr | |||||
| #define SUMF s0 | |||||
| #define TMPF s1 | |||||
| #define TMPVF {v1.s}[0] | |||||
| #define SZ 4 | |||||
| #else | |||||
| #define REG0 xzr | |||||
| #define SUMF d0 | |||||
| #define TMPF d1 | |||||
| #define TMPVF {v1.d}[0] | |||||
| #define SZ 8 | |||||
| #endif | |||||
| /****************************************************************************** | |||||
| * KERNEL_F1          - loads one element into TMPF, advances X by SZ bytes and | |||||
| *                      adds it to SUMF. | |||||
| * KERNEL_F8          - loads eight consecutive elements (32 bytes of floats, or | |||||
| *                      64 bytes of doubles), advances X past them, prefetches | |||||
| *                      1024 bytes ahead and accumulates lane-wise into v0. | |||||
| * KERNEL_F8_FINALIZE - folds the lanes of v0 into the scalar SUMF. | |||||
| * INIT_S             - converts INC_X from elements to bytes. | |||||
| * KERNEL_S1          - loads one element into lane 0 of v1, advances X by INC_X | |||||
| *                      and adds TMPF to SUMF. | |||||
| ******************************************************************************/ | |||||
| .macro KERNEL_F1 | |||||
| ldr TMPF, [X], #SZ | |||||
| fadd SUMF, SUMF, TMPF | |||||
| .endm | |||||
| .macro KERNEL_F8 | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v1.4s, v2.4s}, [X], #32 // v1 = X[0..3], v2 = X[4..7]; X advances by 32 bytes | |||||
| fadd v1.4s, v1.4s, v2.4s // lane-wise X[k] + X[k+4] for k = 0..3 | |||||
| fadd v0.4s, v0.4s, v1.4s // accumulate the four partial sums in v0 | |||||
| PRFM PLDL1KEEP, [X, #1024] | |||||
| #else // DOUBLE | |||||
| ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X] | |||||
| add X, X, #64 | |||||
| PRFM PLDL1KEEP, [X, #1024] | |||||
| fadd v2.2d, v2.2d, v3.2d | |||||
| fadd v4.2d, v4.2d, v5.2d | |||||
| fadd v0.2d, v0.2d, v2.2d | |||||
| fadd v0.2d, v0.2d, v4.2d | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F8_FINALIZE | |||||
| #if !defined(DOUBLE) | |||||
| ext v1.16b, v0.16b, v0.16b, #8 | |||||
| fadd v0.2s, v0.2s, v1.2s | |||||
| faddp SUMF, v0.2s | |||||
| #else | |||||
| faddp SUMF, v0.2d | |||||
| #endif | |||||
| .endm | |||||
| .macro INIT_S | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 | |||||
| ld1 TMPVF, [X], INC_X | |||||
| fadd SUMF, SUMF, TMPF | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| * | |||||
| * Entry point. The result is left in SUMF (s0, or d0 when DOUBLE is defined). | |||||
| * - SUMF and the scratch register s1 or d1 are cleared first; N <= 0 or | |||||
| *   INC_X <= 0 returns that zero. | |||||
| * - INC_X == 1: N >> 3 passes of KERNEL_F8, one KERNEL_F8_FINALIZE, then | |||||
| *   KERNEL_F1 once for each of the remaining N & 7 elements. | |||||
| * - Any other stride: INIT_S, then N >> 2 passes of four KERNEL_S1, then | |||||
| *   KERNEL_S1 once for each of the remaining N & 3 elements. | |||||
| *******************************************************************************/ | |||||
| PROLOGUE | |||||
| fmov SUMF, REG0 | |||||
| #if !defined(DOUBLE) | |||||
| fmov s1, SUMF | |||||
| #else | |||||
| fmov d1, SUMF | |||||
| #endif | |||||
| cmp N, xzr | |||||
| ble .Lsum_kernel_L999 | |||||
| cmp INC_X, xzr | |||||
| ble .Lsum_kernel_L999 | |||||
| cmp INC_X, #1 | |||||
| bne .Lsum_kernel_S_BEGIN | |||||
| .Lsum_kernel_F_BEGIN: | |||||
| asr I, N, #3 | |||||
| cmp I, xzr | |||||
| beq .Lsum_kernel_F1 | |||||
| .Lsum_kernel_F8: | |||||
| KERNEL_F8 | |||||
| subs I, I, #1 | |||||
| bne .Lsum_kernel_F8 | |||||
| KERNEL_F8_FINALIZE | |||||
| .Lsum_kernel_F1: | |||||
| ands I, N, #7 | |||||
| ble .Lsum_kernel_L999 | |||||
| .Lsum_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne .Lsum_kernel_F10 | |||||
| .Lsum_kernel_L999: | |||||
| ret | |||||
| .Lsum_kernel_S_BEGIN: | |||||
| INIT_S | |||||
| asr I, N, #2 | |||||
| cmp I, xzr | |||||
| ble .Lsum_kernel_S1 | |||||
| .Lsum_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne .Lsum_kernel_S4 | |||||
| .Lsum_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble .Lsum_kernel_L999 | |||||
| .Lsum_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne .Lsum_kernel_S10 | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,158 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| * | |||||
| * REG0  - zero register used to clear the accumulator | |||||
| * SUMF  - scalar accumulator that carries the running and final sum (d0) | |||||
| * TMPF  - scalar scratch register (d1) | |||||
| * TMPVF - lane 0 of v1 written as a vector element list | |||||
| * SZ    - size constant 8 | |||||
| * TMPVF and SZ are not referenced by the macros or the routine that follow | |||||
| * in this file. | |||||
| *******************************************************************************/ | |||||
| #define REG0 xzr | |||||
| #define SUMF d0 | |||||
| #define TMPF d1 | |||||
| #define TMPVF {v1.d}[0] | |||||
| #define SZ 8 | |||||
| /****************************************************************************** | |||||
| * KERNEL_F1          - loads two doubles (one complex element, 16 bytes) into | |||||
| *                      v1 and advances X by 16; faddp adds the two lanes into | |||||
| *                      TMPF; SUMF += TMPF. | |||||
| * KERNEL_F4          - loads 64 bytes (8 doubles, i.e. 4 complex elements) into | |||||
| *                      v1-v4 with X advanced by 64, accumulates lane-wise into | |||||
| *                      v0.2d and prefetches 1024 bytes ahead. | |||||
| * KERNEL_F4_FINALIZE - adds the two lanes of v0 into the scalar SUMF. | |||||
| * INIT_S             - converts INC_X from elements to bytes (shift left by 4). | |||||
| * KERNEL_S1          - same arithmetic as KERNEL_F1, but X advances by INC_X. | |||||
| ******************************************************************************/ | |||||
| .macro KERNEL_F1 | |||||
| ld1 {v1.2d}, [X], #16 | |||||
| faddp TMPF, v1.2d | |||||
| fadd SUMF, SUMF, TMPF | |||||
| .endm | |||||
| .macro KERNEL_F4 | |||||
| ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 | |||||
| fadd v1.2d, v1.2d, v2.2d | |||||
| fadd v3.2d, v3.2d, v4.2d | |||||
| fadd v0.2d, v0.2d, v1.2d | |||||
| fadd v0.2d, v0.2d, v3.2d | |||||
| PRFM PLDL1KEEP, [X, #1024] | |||||
| .endm | |||||
| .macro KERNEL_F4_FINALIZE | |||||
| faddp SUMF, v0.2d | |||||
| .endm | |||||
| .macro INIT_S | |||||
| lsl INC_X, INC_X, #4 | |||||
| .endm | |||||
| .macro KERNEL_S1 | |||||
| ld1 {v1.2d}, [X], INC_X | |||||
| faddp TMPF, v1.2d | |||||
| fadd SUMF, SUMF, TMPF | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| * | |||||
| * Entry point. The result is left in SUMF (d0). | |||||
| * - SUMF is cleared first; N <= 0 or INC_X <= 0 returns that zero. | |||||
| * - INC_X == 1: N >> 2 passes of KERNEL_F4, one KERNEL_F4_FINALIZE, then | |||||
| *   KERNEL_F1 once for each of the remaining N & 3 elements. | |||||
| * - Any other stride: INIT_S, then N >> 2 passes of four KERNEL_S1, then | |||||
| *   KERNEL_S1 once for each of the remaining N & 3 elements. | |||||
| *******************************************************************************/ | |||||
| PROLOGUE | |||||
| fmov SUMF, REG0 | |||||
| cmp N, xzr | |||||
| ble .Lzsum_kernel_L999 | |||||
| cmp INC_X, xzr | |||||
| ble .Lzsum_kernel_L999 | |||||
| cmp INC_X, #1 | |||||
| bne .Lzsum_kernel_S_BEGIN | |||||
| .Lzsum_kernel_F_BEGIN: | |||||
| asr I, N, #2 | |||||
| cmp I, xzr | |||||
| beq .Lzsum_kernel_F1 | |||||
| .Lzsum_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne .Lzsum_kernel_F4 | |||||
| KERNEL_F4_FINALIZE | |||||
| .Lzsum_kernel_F1: | |||||
| ands I, N, #3 | |||||
| ble .Lzsum_kernel_L999 | |||||
| .Lzsum_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne .Lzsum_kernel_F10 | |||||
| .Lzsum_kernel_L999: | |||||
| ret | |||||
| .Lzsum_kernel_S_BEGIN: | |||||
| INIT_S | |||||
| asr I, N, #2 | |||||
| cmp I, xzr | |||||
| ble .Lzsum_kernel_S1 | |||||
| .Lzsum_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne .Lzsum_kernel_S4 | |||||
| .Lzsum_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble .Lzsum_kernel_L999 | |||||
| .Lzsum_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne .Lzsum_kernel_S10 | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -60,6 +60,10 @@ CASUMKERNEL = asum.S | |||||
| ZASUMKERNEL = asum.S | ZASUMKERNEL = asum.S | ||||
| XASUMKERNEL = asum.S | XASUMKERNEL = asum.S | ||||
| CSUMKERNEL = sum.S | |||||
| ZSUMKERNEL = sum.S | |||||
| XSUMKERNEL = sum.S | |||||
| CNRM2KERNEL = nrm2.S | CNRM2KERNEL = nrm2.S | ||||
| ZNRM2KERNEL = nrm2.S | ZNRM2KERNEL = nrm2.S | ||||
| XNRM2KERNEL = nrm2.S | XNRM2KERNEL = nrm2.S | ||||
| @@ -0,0 +1,358 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* Copyright 2019, The OpenBLAS project */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
/* Build-time configuration for the IA-64 sum kernel. */
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
/* PREFETCH_SIZE: prefetch distance, counted in elements (it is multiplied by SIZE where PRE1 is set up); chosen per precision. */
| #ifdef XDOUBLE | |||||
| #define PREFETCH_SIZE ( 8 * 16 + 4) | |||||
| #elif defined(DOUBLE) | |||||
| #define PREFETCH_SIZE (16 * 16 + 8) | |||||
| #else | |||||
| #define PREFETCH_SIZE (32 * 16 + 16) | |||||
| #endif | |||||
/* COMPADD is 1 for complex data and 0 otherwise; it adjusts the shift counts and bit tests that depend on N. */
/* STRIDE is the post-increment of the first load of each load pair: INCX for real data, SIZE for complex data. */
| #ifndef COMPLEX | |||||
| #define COMPADD 0 | |||||
| #define STRIDE INCX | |||||
| #else | |||||
| #define COMPADD 1 | |||||
| #define STRIDE SIZE | |||||
| #endif | |||||
/* Register aliases: PRE1 prefetch pointer, I main-loop count, J remainder count, INCX16 prefetch step, */
/* PR saved predicates, ARLC saved ar.lc, and N, X, INCX the incoming arguments. */
| #define PRE1 r2 | |||||
| #define I r17 | |||||
| #define J r18 | |||||
| #define INCX16 r21 | |||||
| #define PR r30 | |||||
| #define ARLC r31 | |||||
| #define N r32 | |||||
| #define X r33 | |||||
| #define INCX r34 | |||||
/* Entry point. Accumulates the loaded values of X into f8..f11 and returns their total in f8. */
/* Setup: PRE1 points PREFETCH_SIZE * SIZE bytes past X, f8..f11 are initialised from f0, ar.lc is saved in ARLC. */
/* With F_INTERFACE, N and INCX arrive as pointers and are loaded (and sign-extended unless USE64BITINT). */
/* The routine returns immediately when INCX <= 0 (p6) or N <= 0 (p7). */
/* I = N >> (4 - COMPADD) is the main-loop trip count, J = N & ((1 << (4 - COMPADD)) - 1) the remainder. */
/* INCX is scaled to bytes; for COMPLEX it is then reduced by SIZE so a STRIDE/INCX load pair steps over one element. */
| PROLOGUE | |||||
| .prologue | |||||
| PROFCODE | |||||
| { .mfi | |||||
| adds PRE1 = PREFETCH_SIZE * SIZE, X | |||||
| mov f8 = f0 | |||||
| .save ar.lc, ARLC | |||||
| mov ARLC = ar.lc | |||||
| } | |||||
| ;; | |||||
| .body | |||||
| #ifdef F_INTERFACE | |||||
| { .mmi | |||||
| LDINT N = [N] | |||||
| LDINT INCX = [INCX] | |||||
| nop.i 0 | |||||
| } | |||||
| ;; | |||||
| #ifndef USE64BITINT | |||||
| { .mii | |||||
| nop.m 0 | |||||
| sxt4 N = N | |||||
| sxt4 INCX = INCX | |||||
| } | |||||
| ;; | |||||
| #endif | |||||
| #endif | |||||
| { .mmi | |||||
| cmp.lt p0, p6 = r0, INCX | |||||
| cmp.lt p0, p7 = r0, N | |||||
| shr I = N, (4 - COMPADD) | |||||
| } | |||||
| { .mbb | |||||
| and J = ((1 << (4 - COMPADD)) - 1), N | |||||
| (p6) br.ret.sptk.many b0 | |||||
| (p7) br.ret.sptk.many b0 | |||||
| } | |||||
| ;; | |||||
| { .mfi | |||||
| adds I = -1, I | |||||
| mov f10 = f0 | |||||
| mov PR = pr | |||||
| } | |||||
| { .mfi | |||||
| cmp.eq p9, p0 = r0, J | |||||
| mov f9 = f0 | |||||
| tbit.z p0, p12 = N, 3 - COMPADD | |||||
| } | |||||
| ;; | |||||
| { .mmi | |||||
| cmp.eq p16, p0 = r0, r0 | |||||
| cmp.ne p17, p0 = r0, r0 | |||||
| mov ar.ec= 3 | |||||
| } | |||||
| { .mfi | |||||
| cmp.ne p18, p0 = r0, r0 | |||||
| mov f11 = f0 | |||||
| shl INCX = INCX, BASE_SHIFT + COMPADD | |||||
| } | |||||
| ;; | |||||
| { .mmi | |||||
| #ifdef XDOUBLE | |||||
| shladd INCX16 = INCX, (3 - COMPADD), r0 | |||||
| #else | |||||
| shladd INCX16 = INCX, (4 - COMPADD), r0 | |||||
| #endif | |||||
| cmp.ne p19, p0 = r0, r0 | |||||
| mov ar.lc = I | |||||
| } | |||||
| { .mmb | |||||
| cmp.gt p8 ,p0 = r0, I | |||||
| #ifdef COMPLEX | |||||
| adds INCX = - SIZE, INCX | |||||
| #else | |||||
| nop.m 0 | |||||
| #endif | |||||
| (p8) br.cond.dpnt .L55 | |||||
| } | |||||
| ;; | |||||
/* Main loop: 16 LDFD loads per pass, spread over the four accumulators f8..f11; the branch is skipped when I was 0. */
/* NOTE(review): this appears to be a modulo-scheduled loop (predicates p16..p19, ar.ec = 3, br.ctop); */
/* the statement order is significant and the stage timing has not been re-derived here. */
| .align 32 | |||||
| .L52: | |||||
| { .mmf | |||||
| (p16) lfetch.nt1 [PRE1], INCX16 | |||||
| (p16) LDFD f32 = [X], STRIDE | |||||
| } | |||||
| { .mfb | |||||
| (p19) FADD f8 = f8, f71 | |||||
| } | |||||
| ;; | |||||
| { .mmf | |||||
| (p16) LDFD f35 = [X], INCX | |||||
| } | |||||
| { .mfb | |||||
| (p19) FADD f9 = f9, f74 | |||||
| } | |||||
| ;; | |||||
| { .mmf | |||||
| (p16) LDFD f38 = [X], STRIDE | |||||
| } | |||||
| { .mfb | |||||
| (p19) FADD f10 = f10, f77 | |||||
| } | |||||
| ;; | |||||
| { .mmf | |||||
| (p16) LDFD f41 = [X], INCX | |||||
| } | |||||
| { .mfb | |||||
| (p19) FADD f11 = f11, f80 | |||||
| } | |||||
| ;; | |||||
| { .mmf | |||||
| (p16) LDFD f44 = [X], STRIDE | |||||
| } | |||||
| { .mfb | |||||
| (p18) FADD f8 = f8, f34 | |||||
| } | |||||
| ;; | |||||
| { .mmf | |||||
| (p16) LDFD f47 = [X], INCX | |||||
| } | |||||
| { .mfb | |||||
| (p18) FADD f9 = f9, f37 | |||||
| } | |||||
| ;; | |||||
| { .mmf | |||||
| (p16) LDFD f50 = [X], STRIDE | |||||
| } | |||||
| { .mfb | |||||
| (p18) FADD f10 = f10, f40 | |||||
| } | |||||
| ;; | |||||
| { .mmf | |||||
| (p16) LDFD f53 = [X], INCX | |||||
| } | |||||
| { .mfb | |||||
| (p18) FADD f11 = f11, f43 | |||||
| } | |||||
| ;; | |||||
| { .mmf | |||||
| #ifdef XDOUBLE | |||||
| (p16) lfetch.nt1 [PRE1], INCX16 | |||||
| #endif | |||||
| (p16) LDFD f56 = [X], STRIDE | |||||
| } | |||||
| { .mfb | |||||
| (p18) FADD f8 = f8, f46 | |||||
| } | |||||
| ;; | |||||
| { .mmf | |||||
| (p16) LDFD f59 = [X], INCX | |||||
| } | |||||
| { .mfb | |||||
| (p18) FADD f9 = f9, f49 | |||||
| } | |||||
| ;; | |||||
| { .mmf | |||||
| (p16) LDFD f62 = [X], STRIDE | |||||
| } | |||||
| { .mfb | |||||
| (p18) FADD f10 = f10, f52 | |||||
| } | |||||
| ;; | |||||
| { .mmf | |||||
| (p16) LDFD f65 = [X], INCX | |||||
| } | |||||
| { .mfb | |||||
| (p18) FADD f11 = f11, f55 | |||||
| } | |||||
| ;; | |||||
| { .mmf | |||||
| (p16) LDFD f68 = [X], STRIDE | |||||
| } | |||||
| { .mfb | |||||
| (p18) FADD f8 = f8, f58 | |||||
| } | |||||
| ;; | |||||
| { .mmf | |||||
| (p16) LDFD f71 = [X], INCX | |||||
| } | |||||
| { .mfb | |||||
| (p18) FADD f9 = f9, f61 | |||||
| } | |||||
| ;; | |||||
| { .mmf | |||||
| (p16) LDFD f74 = [X], STRIDE | |||||
| } | |||||
| { .mfb | |||||
| (p18) FADD f10 = f10, f64 | |||||
| } | |||||
| ;; | |||||
| { .mmf | |||||
| (p16) LDFD f77 = [X], INCX | |||||
| } | |||||
| { .mfb | |||||
| (p18) FADD f11 = f11, f67 | |||||
| br.ctop.sptk.few .L52 | |||||
| } | |||||
| ;; | |||||
/* Executed once after the loop falls through: adds f71, f74, f77 and f80 to the four accumulators. */
| FADD f8 = f8, f71 | |||||
| FADD f9 = f9, f74 | |||||
| FADD f10 = f10, f77 | |||||
| FADD f11 = f11, f80 | |||||
| .align 32 | |||||
| ;; | |||||
/* Remainder: skipped via p9 when J == 0. Otherwise bits of N select groups of loads: */
/* p12 (bit 3 - COMPADD) 8 loads, p13 (bit 2 - COMPADD) 4 loads, p14 (bit 1 - COMPADD) 2 loads, */
/* and for real data only p15 (bit 0) one load. */
| .L55: | |||||
| (p12) LDFD f32 = [X], STRIDE | |||||
| (p9) br.cond.dptk .L998 | |||||
| ;; | |||||
| (p12) LDFD f33 = [X], INCX | |||||
| ;; | |||||
| (p12) LDFD f34 = [X], STRIDE | |||||
| ;; | |||||
| (p12) LDFD f35 = [X], INCX | |||||
| tbit.z p0, p13 = N, (2 - COMPADD) | |||||
| ;; | |||||
| (p12) LDFD f36 = [X], STRIDE | |||||
| tbit.z p0, p14 = N, (1 - COMPADD) | |||||
| ;; | |||||
| (p12) LDFD f37 = [X], INCX | |||||
| #ifndef COMPLEX | |||||
| tbit.z p0, p15 = N, 0 | |||||
| #endif | |||||
| ;; | |||||
| (p12) LDFD f38 = [X], STRIDE | |||||
| ;; | |||||
| (p12) LDFD f39 = [X], INCX | |||||
| ;; | |||||
| (p13) LDFD f40 = [X], STRIDE | |||||
| ;; | |||||
| (p13) LDFD f41 = [X], INCX | |||||
| ;; | |||||
| (p13) LDFD f42 = [X], STRIDE | |||||
| (p12) FADD f8 = f8, f32 | |||||
| ;; | |||||
| (p13) LDFD f43 = [X], INCX | |||||
| (p12) FADD f9 = f9, f33 | |||||
| ;; | |||||
| (p14) LDFD f44 = [X], STRIDE | |||||
| (p12) FADD f10 = f10, f34 | |||||
| ;; | |||||
| (p14) LDFD f45 = [X], INCX | |||||
| (p12) FADD f11 = f11, f35 | |||||
| ;; | |||||
| #ifndef COMPLEX | |||||
| (p15) LDFD f46 = [X] | |||||
| #endif | |||||
| (p12) FADD f8 = f8, f36 | |||||
| ;; | |||||
| (p12) FADD f9 = f9, f37 | |||||
| (p12) FADD f10 = f10, f38 | |||||
| (p12) FADD f11 = f11, f39 | |||||
| ;; | |||||
| (p13) FADD f8 = f8, f40 | |||||
| (p13) FADD f9 = f9, f41 | |||||
| #ifndef COMPLEX | |||||
| #endif | |||||
| (p13) FADD f10 = f10, f42 | |||||
| ;; | |||||
| (p13) FADD f11 = f11, f43 | |||||
| (p14) FADD f8 = f8, f44 | |||||
| (p14) FADD f9 = f9, f45 | |||||
| #ifndef COMPLEX | |||||
| (p15) FADD f10 = f10, f46 | |||||
| #endif | |||||
| ;; | |||||
/* Final reduction: f8 = (f8 + f9) + (f10 + f11); ar.lc and the predicate registers saved in PR are restored before returning. */
| .align 32 | |||||
| .L998: | |||||
| { .mfi | |||||
| FADD f8 = f8, f9 | |||||
| mov ar.lc = ARLC | |||||
| } | |||||
| { .mmf | |||||
| FADD f10 = f10, f11 | |||||
| } | |||||
| ;; | |||||
| { .mii | |||||
| mov pr = PR, -65474 | |||||
| } | |||||
| ;; | |||||
| { .mfb | |||||
| FADD f8 = f8, f10 | |||||
| br.ret.sptk.many b0 | |||||
| } | |||||
| EPILOGUE | |||||
| @@ -30,6 +30,11 @@ IDMAXKERNEL = ../mips/imax.c | |||||
| ISMINKERNEL = ../mips/imin.c | ISMINKERNEL = ../mips/imin.c | ||||
| IDMINKERNEL = ../mips/imin.c | IDMINKERNEL = ../mips/imin.c | ||||
| SSUMKERNEL = ../mips/sum.c | |||||
| DSUMKERNEL = ../mips/sum.c | |||||
| CSUMKERNEL = ../mips/zsum.c | |||||
| ZSUMKERNEL = ../mips/zsum.c | |||||
| ifdef HAVE_MSA | ifdef HAVE_MSA | ||||
| SASUMKERNEL = ../mips/sasum_msa.c | SASUMKERNEL = ../mips/sasum_msa.c | ||||
| DASUMKERNEL = ../mips/dasum_msa.c | DASUMKERNEL = ../mips/dasum_msa.c | ||||
| @@ -0,0 +1,47 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| FLOAT sumf = 0.0; | |||||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||||
| n *= inc_x; | |||||
| while(i < n) | |||||
| { | |||||
| sumf += x[i]; | |||||
| i += inc_x; | |||||
| } | |||||
| return(sumf); | |||||
| } | |||||
| @@ -0,0 +1,52 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| #define CSUM1(x,i) x[i]+x[i+1] | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| FLOAT sumf = 0.0; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||||
| inc_x2 = 2 * inc_x; | |||||
| n *= inc_x2; | |||||
| while(i < n) | |||||
| { | |||||
| sumf += CSUM1(x,i); | |||||
| i += inc_x2; | |||||
| } | |||||
| return(sumf); | |||||
| } | |||||
| @@ -0,0 +1,332 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
/*
 * Register aliases for the MIPS64 sum kernel that follows.
 *
 * N, X, INCX : incoming arguments (element count, vector pointer, stride).
 * I          : loop trip counter.
 * TEMP       : scratch register; the kernel loads SIZE into it to test for
 *              a unit stride.
 */
| #define N $4 | |||||
| #define X $5 | |||||
| #define INCX $6 | |||||
| #define I $2 | |||||
| #define TEMP $3 | |||||
/* a1..a8: destination registers for elements loaded from X. */
| #define a1 $f2 | |||||
| #define a2 $f3 | |||||
| #define a3 $f4 | |||||
| #define a4 $f5 | |||||
| #define a5 $f6 | |||||
| #define a6 $f7 | |||||
| #define a7 $f8 | |||||
| #define a8 $f9 | |||||
/* t1..t4: staging copies of loaded elements, consumed by the ADDs. */
| #define t1 $f10 | |||||
| #define t2 $f11 | |||||
| #define t3 $f12 | |||||
| #define t4 $f13 | |||||
/* s1, s2: two partial-sum accumulators; the kernel folds s2 into s1 at exit. */
| #define s1 $f0 | |||||
| #define s2 $f1 | |||||
/*
 * MIPS64 sum kernel: adds N elements of X, spaced INCX elements apart, and
 * leaves the total in s1.  Elements are added as loaded (copied with MOV,
 * no absolute value).  Two accumulators (s1, s2) are used and combined at
 * .L999.
 *
 * Only N is tested for <= 0; the sign of INCX is not checked in this code.
 *
 * NOTE(review): the instruction following each branch/jump is written as a
 * branch delay slot (e.g. the final ADD after "j $31"); this assumes the
 * assembler is in noreorder mode, presumably set up by PROLOGUE/common.h -
 * TODO confirm.  LD/ADD/MOV/MTC/LDINT, SIZE and BASE_SHIFT are macros from
 * common.h.
 */
| PROLOGUE | |||||
| #ifdef F_INTERFACE | |||||
/* With F_INTERFACE, N and INCX hold addresses: replace them by the values. */
| LDINT N, 0(N) | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
/* Initialise both accumulators from integer register $0. */
| MTC $0, s1 | |||||
| MTC $0, s2 | |||||
/* Scale INCX by 2^BASE_SHIFT (presumably element stride -> byte stride). */
| dsll INCX, INCX, BASE_SHIFT | |||||
| blez N, .L999 | |||||
| li TEMP, SIZE | |||||
/* Scaled stride != SIZE: take the strided path.  I = N / 8 either way. */
| bne INCX, TEMP, .L20 | |||||
| dsra I, N, 3 | |||||
/* ---- Unit-stride path: 8 elements per iteration ---- */
| blez I, .L15 | |||||
| NOP | |||||
/* Preload the first 8 elements; stage the first four into t1..t4. */
| LD a1, 0 * SIZE(X) | |||||
| LD a2, 1 * SIZE(X) | |||||
| LD a3, 2 * SIZE(X) | |||||
| LD a4, 3 * SIZE(X) | |||||
| LD a5, 4 * SIZE(X) | |||||
| MOV t1, a1 | |||||
| LD a6, 5 * SIZE(X) | |||||
| MOV t2, a2 | |||||
| LD a7, 6 * SIZE(X) | |||||
| MOV t3, a3 | |||||
| MOV t4, a4 | |||||
| daddiu I, I, -1 | |||||
/* Only one group of 8: skip the steady-state loop and drain at .L13. */
| blez I, .L13 | |||||
| LD a8, 7 * SIZE(X) | |||||
| .align 3 | |||||
/* Steady state: add the staged values while loading the next group of 8. */
| .L12: | |||||
| ADD s1, s1, t1 | |||||
| LD a1, 8 * SIZE(X) | |||||
| MOV t1, a5 | |||||
| daddiu I, I, -1 | |||||
| ADD s2, s2, t2 | |||||
| LD a2, 9 * SIZE(X) | |||||
| MOV t2, a6 | |||||
| NOP | |||||
| ADD s1, s1, t3 | |||||
| LD a3, 10 * SIZE(X) | |||||
| MOV t3, a7 | |||||
| NOP | |||||
| ADD s2, s2, t4 | |||||
| LD a4, 11 * SIZE(X) | |||||
| MOV t4, a8 | |||||
/* X advances by 8 elements here, so the offsets 4..7 below address the
   second half of the group whose first half was just loaded at 8..11. */
| daddiu X, X, 8 * SIZE | |||||
| ADD s1, s1, t1 | |||||
| LD a5, 4 * SIZE(X) | |||||
| MOV t1, a1 | |||||
| NOP | |||||
| ADD s2, s2, t2 | |||||
| LD a6, 5 * SIZE(X) | |||||
| MOV t2, a2 | |||||
| NOP | |||||
| ADD s1, s1, t3 | |||||
| LD a7, 6 * SIZE(X) | |||||
| MOV t3, a3 | |||||
| NOP | |||||
| ADD s2, s2, t4 | |||||
| LD a8, 7 * SIZE(X) | |||||
| bgtz I, .L12 | |||||
| MOV t4, a4 | |||||
| .align 3 | |||||
/* Drain: add the last 8 loaded elements and step X past them. */
| .L13: | |||||
| ADD s1, s1, t1 | |||||
| daddiu X, X, 8 * SIZE | |||||
| MOV t1, a5 | |||||
| NOP | |||||
| ADD s2, s2, t2 | |||||
| MOV t2, a6 | |||||
| ADD s1, s1, t3 | |||||
| MOV t3, a7 | |||||
| ADD s2, s2, t4 | |||||
| MOV t4, a8 | |||||
| ADD s1, s1, t1 | |||||
| ADD s2, s2, t2 | |||||
| ADD s1, s1, t3 | |||||
| ADD s2, s2, t4 | |||||
| .align 3 | |||||
/* Unit-stride remainder: N & 7 elements, one per iteration. */
| .L15: | |||||
| andi I, N, 7 | |||||
| blez I, .L999 | |||||
| NOP | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, 0 * SIZE(X) | |||||
| daddiu I, I, -1 | |||||
| MOV t1, a1 | |||||
| ADD s1, s1, t1 | |||||
| bgtz I, .L16 | |||||
| daddiu X, X, SIZE | |||||
| j .L999 | |||||
| NOP | |||||
| .align 3 | |||||
/* ---- Strided path: same pipeline, X stepped by INCX after every load.
   I already holds N / 8 (set in the slot after the bne above). ---- */
| .L20: | |||||
| blez I, .L25 | |||||
| NOP | |||||
| LD a1, 0 * SIZE(X) | |||||
| daddu X, X, INCX | |||||
| LD a2, 0 * SIZE(X) | |||||
| daddu X, X, INCX | |||||
| LD a3, 0 * SIZE(X) | |||||
| daddu X, X, INCX | |||||
| LD a4, 0 * SIZE(X) | |||||
| daddu X, X, INCX | |||||
| LD a5, 0 * SIZE(X) | |||||
| daddu X, X, INCX | |||||
| LD a6, 0 * SIZE(X) | |||||
| daddu X, X, INCX | |||||
| MOV t1, a1 | |||||
| LD a7, 0 * SIZE(X) | |||||
| MOV t2, a2 | |||||
| daddu X, X, INCX | |||||
| MOV t3, a3 | |||||
| LD a8, 0 * SIZE(X) | |||||
| MOV t4, a4 | |||||
| daddiu I, I, -1 | |||||
| blez I, .L24 | |||||
| daddu X, X, INCX | |||||
| .align 3 | |||||
/* Steady state for the strided case. */
| .L23: | |||||
| ADD s1, s1, t1 | |||||
| LD a1, 0 * SIZE(X) | |||||
| MOV t1, a5 | |||||
| daddu X, X, INCX | |||||
| ADD s2, s2, t2 | |||||
| LD a2, 0 * SIZE(X) | |||||
| MOV t2, a6 | |||||
| daddu X, X, INCX | |||||
| ADD s1, s1, t3 | |||||
| LD a3, 0 * SIZE(X) | |||||
| MOV t3, a7 | |||||
| daddu X, X, INCX | |||||
| ADD s2, s2, t4 | |||||
| LD a4, 0 * SIZE(X) | |||||
| MOV t4, a8 | |||||
| daddu X, X, INCX | |||||
| ADD s1, s1, t1 | |||||
| LD a5, 0 * SIZE(X) | |||||
| MOV t1, a1 | |||||
| daddu X, X, INCX | |||||
| ADD s2, s2, t2 | |||||
| LD a6, 0 * SIZE(X) | |||||
| MOV t2, a2 | |||||
| daddu X, X, INCX | |||||
| ADD s1, s1, t3 | |||||
| LD a7, 0 * SIZE(X) | |||||
| MOV t3, a3 | |||||
| daddu X, X, INCX | |||||
| ADD s2, s2, t4 | |||||
| LD a8, 0 * SIZE(X) | |||||
| MOV t4, a4 | |||||
| daddiu I, I, -1 | |||||
| bgtz I, .L23 | |||||
| daddu X, X, INCX | |||||
| .align 3 | |||||
/* Drain the last 8 loaded elements (X has already been advanced). */
| .L24: | |||||
| ADD s1, s1, t1 | |||||
| MOV t1, a5 | |||||
| ADD s2, s2, t2 | |||||
| MOV t2, a6 | |||||
| ADD s1, s1, t3 | |||||
| MOV t3, a7 | |||||
| ADD s2, s2, t4 | |||||
| MOV t4, a8 | |||||
| ADD s1, s1, t1 | |||||
| ADD s2, s2, t2 | |||||
| ADD s1, s1, t3 | |||||
| ADD s2, s2, t4 | |||||
| .align 3 | |||||
/* Strided remainder: N & 7 elements, one per iteration. */
| .L25: | |||||
| andi I, N, 7 | |||||
| blez I, .L999 | |||||
| NOP | |||||
| .align 3 | |||||
| .L26: | |||||
| LD a1, 0 * SIZE(X) | |||||
| daddiu I, I, -1 | |||||
| MOV t1, a1 | |||||
| daddu X, X, INCX | |||||
| bgtz I, .L26 | |||||
| ADD s1, s1, t1 | |||||
| .align 3 | |||||
/* Return; the ADD after the jump combines the two partial sums into s1. */
| .L999: | |||||
| j $31 | |||||
| ADD s1, s1, s2 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,204 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
/*
 * Register aliases for the MIPS64 complex sum (zsum) kernel that follows.
 *
 * N, X, INCX : incoming arguments (element count, vector pointer, stride).
 * I          : loop trip counter.
 * TEMP       : scratch alias; not referenced by the kernel code visible
 *              below.
 */
| #define N $4 | |||||
| #define X $5 | |||||
| #define INCX $6 | |||||
| #define I $2 | |||||
| #define TEMP $3 | |||||
/* a1..a8: loaded values; odd-numbered ones are read from offset 0 * SIZE
   and even-numbered ones from offset 1 * SIZE of each element. */
| #define a1 $f2 | |||||
| #define a2 $f3 | |||||
| #define a3 $f4 | |||||
| #define a4 $f5 | |||||
| #define a5 $f6 | |||||
| #define a6 $f7 | |||||
| #define a7 $f8 | |||||
| #define a8 $f9 | |||||
/* t1..t4: staging copies of loaded values, consumed by the ADDs. */
| #define t1 $f10 | |||||
| #define t2 $f11 | |||||
| #define t3 $f12 | |||||
| #define t4 $f13 | |||||
/* s1, s2: two partial-sum accumulators; the kernel folds s2 into s1 at exit. */
| #define s1 $f0 | |||||
| #define s2 $f1 | |||||
/*
 * MIPS64 complex sum kernel: for each of N elements of X (two values per
 * element, at offsets 0 * SIZE and 1 * SIZE), adds the first value into s1
 * and the second into s2, then returns s1 + s2 in s1.  Values are added as
 * loaded (copied with MOV, no absolute value).
 *
 * There is a single strided code path (no unit-stride special case), and
 * only N is tested for <= 0; the sign of INCX is not checked in this code.
 *
 * NOTE(review): the instruction following each branch/jump is written as a
 * branch delay slot; this assumes noreorder mode, presumably set up by
 * PROLOGUE/common.h - TODO confirm.  LD/ADD/MOV/MTC/LDINT, SIZE and
 * ZBASE_SHIFT are macros from common.h.
 */
| PROLOGUE | |||||
| #ifdef F_INTERFACE | |||||
/* With F_INTERFACE, N and INCX hold addresses: replace them by the values. */
| LDINT N, 0(N) | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
/* Initialise both accumulators from integer register $0. */
| MTC $0, s1 | |||||
| MTC $0, s2 | |||||
/* Scale INCX by 2^ZBASE_SHIFT (presumably element stride -> byte stride). */
| dsll INCX, INCX, ZBASE_SHIFT | |||||
| blez N, .L999 | |||||
/* I = N / 4: four elements (eight loads) are handled per iteration. */
| dsra I, N, 2 | |||||
| blez I, .L25 | |||||
| NOP | |||||
/* Preload four elements; stage the first two into t1..t4. */
| LD a1, 0 * SIZE(X) | |||||
| LD a2, 1 * SIZE(X) | |||||
| daddu X, X, INCX | |||||
| LD a3, 0 * SIZE(X) | |||||
| LD a4, 1 * SIZE(X) | |||||
| daddu X, X, INCX | |||||
| LD a5, 0 * SIZE(X) | |||||
| LD a6, 1 * SIZE(X) | |||||
| daddu X, X, INCX | |||||
| MOV t1, a1 | |||||
| MOV t2, a2 | |||||
| LD a7, 0 * SIZE(X) | |||||
| LD a8, 1 * SIZE(X) | |||||
| MOV t3, a3 | |||||
| MOV t4, a4 | |||||
| daddiu I, I, -1 | |||||
/* Only one group of four: skip the steady-state loop and drain at .L24. */
| blez I, .L24 | |||||
| daddu X, X, INCX | |||||
| .align 3 | |||||
/* Steady state: add the staged values while loading the next four elements. */
| .L23: | |||||
| ADD s1, s1, t1 | |||||
| LD a1, 0 * SIZE(X) | |||||
| MOV t1, a5 | |||||
| daddiu I, I, -1 | |||||
| ADD s2, s2, t2 | |||||
| LD a2, 1 * SIZE(X) | |||||
| MOV t2, a6 | |||||
| daddu X, X, INCX | |||||
| ADD s1, s1, t3 | |||||
| LD a3, 0 * SIZE(X) | |||||
| MOV t3, a7 | |||||
| NOP | |||||
| ADD s2, s2, t4 | |||||
| LD a4, 1 * SIZE(X) | |||||
| MOV t4, a8 | |||||
| daddu X, X, INCX | |||||
| ADD s1, s1, t1 | |||||
| LD a5, 0 * SIZE(X) | |||||
| MOV t1, a1 | |||||
| NOP | |||||
| ADD s2, s2, t2 | |||||
| LD a6, 1 * SIZE(X) | |||||
| MOV t2, a2 | |||||
| daddu X, X, INCX | |||||
| ADD s1, s1, t3 | |||||
| LD a7, 0 * SIZE(X) | |||||
| MOV t3, a3 | |||||
| LD a8, 1 * SIZE(X) | |||||
| ADD s2, s2, t4 | |||||
| daddu X, X, INCX | |||||
| bgtz I, .L23 | |||||
| MOV t4, a4 | |||||
| .align 3 | |||||
/* Drain the last four loaded elements (X has already been advanced). */
| .L24: | |||||
| ADD s1, s1, t1 | |||||
| MOV t1, a5 | |||||
| ADD s2, s2, t2 | |||||
| MOV t2, a6 | |||||
| ADD s1, s1, t3 | |||||
| MOV t3, a7 | |||||
| ADD s2, s2, t4 | |||||
| MOV t4, a8 | |||||
| ADD s1, s1, t1 | |||||
| ADD s2, s2, t2 | |||||
| ADD s1, s1, t3 | |||||
| ADD s2, s2, t4 | |||||
| .align 3 | |||||
/* Remainder: N & 3 elements, one per iteration. */
| .L25: | |||||
| andi I, N, 3 | |||||
| blez I, .L999 | |||||
| NOP | |||||
| .align 3 | |||||
| .L26: | |||||
| LD a1, 0 * SIZE(X) | |||||
| LD a2, 1 * SIZE(X) | |||||
| MOV t1, a1 | |||||
| daddiu I, I, -1 | |||||
| MOV t2, a2 | |||||
| daddu X, X, INCX | |||||
| ADD s1, s1, t1 | |||||
| bgtz I, .L26 | |||||
| ADD s2, s2, t2 | |||||
| .align 3 | |||||
/* Return; the ADD after the jump combines the two partial sums into s1. */
| .L999: | |||||
| j $31 | |||||
| ADD s1, s1, s2 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,446 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
/*
 * Register aliases for the POWER sum kernel that follows.
 *
 * N, X, INCX : incoming arguments (element count, vector pointer, stride).
 */
| #define N r3 | |||||
| #define X r4 | |||||
| #define INCX r5 | |||||
/* PREA: holds L1_PREFETCHSIZE, passed to L1_PREFETCH in the main loop. */
| #define PREA r8 | |||||
/* FZERO: f0 is loaded with zero and also serves as the first accumulator. */
| #define FZERO f0 | |||||
/* Stack frame: 18 saved FP registers (f14..f31, 8 bytes each, offsets
   0..136) plus a scratch word at offset 144 used to build the zero. */
| #define STACKSIZE 160 | |||||
/*
 * POWER sum kernel: adds N elements of X, spaced INCX elements apart, and
 * leaves the total in f1.  Elements are added as loaded (copied with fmr,
 * no absolute value).  Eight accumulators (f0..f7) are used and reduced
 * pairwise at LL(999).
 *
 * Returns zero (all accumulators still zero) when N <= 0 or INCX <= 0.
 *
 * Loaded values land in f8..f15 and f24..f31; f16..f23 are staging copies
 * consumed by the FADDs.  f14..f31 are saved on entry and restored on exit.
 *
 * NOTE(review): LFD/LFDUX/FADD, LDINT, LL(), SIZE, BASE_SHIFT, PROFCODE and
 * L1_PREFETCH are macros from common.h; their exact expansions are not
 * visible here.
 */
| PROLOGUE | |||||
| PROFCODE | |||||
| addi SP, SP, -STACKSIZE | |||||
| li r0, 0 | |||||
/* Save f14..f31, which this kernel overwrites. */
| stfd f14, 0(SP) | |||||
| stfd f15, 8(SP) | |||||
| stfd f16, 16(SP) | |||||
| stfd f17, 24(SP) | |||||
| stfd f18, 32(SP) | |||||
| stfd f19, 40(SP) | |||||
| stfd f20, 48(SP) | |||||
| stfd f21, 56(SP) | |||||
| stfd f22, 64(SP) | |||||
| stfd f23, 72(SP) | |||||
| stfd f24, 80(SP) | |||||
| stfd f25, 88(SP) | |||||
| stfd f26, 96(SP) | |||||
| stfd f27, 104(SP) | |||||
| stfd f28, 112(SP) | |||||
| stfd f29, 120(SP) | |||||
| stfd f30, 128(SP) | |||||
| stfd f31, 136(SP) | |||||
/* Build a floating-point zero: store integer 0, reload it as a float. */
| stw r0, 144(SP) | |||||
| lfs FZERO,144(SP) | |||||
| #ifdef F_INTERFACE | |||||
/* With F_INTERFACE, N and INCX hold addresses: replace them by the values. */
| LDINT N, 0(N) | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
/* Scale INCX by 2^BASE_SHIFT (presumably element stride -> byte stride). */
| slwi INCX, INCX, BASE_SHIFT | |||||
/* Zero the remaining accumulators f1..f7 (f0 is already zero). */
| fmr f1, FZERO | |||||
| fmr f2, FZERO | |||||
| fmr f3, FZERO | |||||
| fmr f4, FZERO | |||||
| fmr f5, FZERO | |||||
| fmr f6, FZERO | |||||
| fmr f7, FZERO | |||||
| li PREA, L1_PREFETCHSIZE | |||||
/* Nothing to do for N <= 0 or INCX <= 0: go straight to the exit. */
| cmpwi cr0, N, 0 | |||||
| ble- LL(999) | |||||
| cmpwi cr0, INCX, 0 | |||||
| ble- LL(999) | |||||
/* Scaled stride != SIZE: take the strided path at LL(100). */
| cmpwi cr0, INCX, SIZE | |||||
| bne- cr0, LL(100) | |||||
/* ---- Unit-stride path: CTR = N / 16, 16 elements per iteration ---- */
| srawi. r0, N, 4 | |||||
| mtspr CTR, r0 | |||||
| beq- cr0, LL(50) | |||||
| .align 4 | |||||
/* Preload the first 16 elements; stage the first eight into f16..f23. */
| LFD f8, 0 * SIZE(X) | |||||
| LFD f9, 1 * SIZE(X) | |||||
| LFD f10, 2 * SIZE(X) | |||||
| LFD f11, 3 * SIZE(X) | |||||
| LFD f12, 4 * SIZE(X) | |||||
| LFD f13, 5 * SIZE(X) | |||||
| LFD f14, 6 * SIZE(X) | |||||
| LFD f15, 7 * SIZE(X) | |||||
| LFD f24, 8 * SIZE(X) | |||||
| LFD f25, 9 * SIZE(X) | |||||
| LFD f26, 10 * SIZE(X) | |||||
| LFD f27, 11 * SIZE(X) | |||||
| LFD f28, 12 * SIZE(X) | |||||
| LFD f29, 13 * SIZE(X) | |||||
| LFD f30, 14 * SIZE(X) | |||||
| LFD f31, 15 * SIZE(X) | |||||
| fmr f16, f8 | |||||
| fmr f17, f9 | |||||
| fmr f18, f10 | |||||
| fmr f19, f11 | |||||
| fmr f20, f12 | |||||
| fmr f21, f13 | |||||
| fmr f22, f14 | |||||
| fmr f23, f15 | |||||
/* Only one group of 16: skip the steady-state loop and drain at LL(20). */
| bdz LL(20) | |||||
| .align 4 | |||||
/* Steady state: add the staged values while loading elements 16..31
   relative to X, then advance X by 16 elements. */
| LL(10): | |||||
| FADD f0, f0, f16 | |||||
| fmr f16, f24 | |||||
| FADD f1, f1, f17 | |||||
| fmr f17, f25 | |||||
| FADD f2, f2, f18 | |||||
| fmr f18, f26 | |||||
| FADD f3, f3, f19 | |||||
| fmr f19, f27 | |||||
| LFD f8, 16 * SIZE(X) | |||||
| LFD f9, 17 * SIZE(X) | |||||
| LFD f10, 18 * SIZE(X) | |||||
| LFD f11, 19 * SIZE(X) | |||||
| FADD f4, f4, f20 | |||||
| fmr f20, f28 | |||||
| FADD f5, f5, f21 | |||||
| fmr f21, f29 | |||||
| FADD f6, f6, f22 | |||||
| fmr f22, f30 | |||||
| FADD f7, f7, f23 | |||||
| fmr f23, f31 | |||||
| LFD f12, 20 * SIZE(X) | |||||
| LFD f13, 21 * SIZE(X) | |||||
| LFD f14, 22 * SIZE(X) | |||||
| LFD f15, 23 * SIZE(X) | |||||
| FADD f0, f0, f16 | |||||
| fmr f16, f8 | |||||
| FADD f1, f1, f17 | |||||
| fmr f17, f9 | |||||
| FADD f2, f2, f18 | |||||
| fmr f18, f10 | |||||
| FADD f3, f3, f19 | |||||
| fmr f19, f11 | |||||
| LFD f24, 24 * SIZE(X) | |||||
| LFD f25, 25 * SIZE(X) | |||||
| LFD f26, 26 * SIZE(X) | |||||
| LFD f27, 27 * SIZE(X) | |||||
| FADD f4, f4, f20 | |||||
| fmr f20, f12 | |||||
| FADD f5, f5, f21 | |||||
| fmr f21, f13 | |||||
| FADD f6, f6, f22 | |||||
| fmr f22, f14 | |||||
| FADD f7, f7, f23 | |||||
| fmr f23, f15 | |||||
| LFD f28, 28 * SIZE(X) | |||||
| LFD f29, 29 * SIZE(X) | |||||
| LFD f30, 30 * SIZE(X) | |||||
| LFD f31, 31 * SIZE(X) | |||||
/* The prefetch is issued before the pointer bump, except on POWER6 where
   it is issued after it. */
| #ifndef POWER6 | |||||
| L1_PREFETCH X, PREA | |||||
| #endif | |||||
| addi X, X, 16 * SIZE | |||||
| #ifdef POWER6 | |||||
| L1_PREFETCH X, PREA | |||||
| #endif | |||||
| bdnz LL(10) | |||||
| .align 4 | |||||
/* Drain: add the last 16 loaded elements and step X past them. */
| LL(20): | |||||
| FADD f0, f0, f16 | |||||
| fmr f16, f24 | |||||
| FADD f1, f1, f17 | |||||
| fmr f17, f25 | |||||
| FADD f2, f2, f18 | |||||
| fmr f18, f26 | |||||
| FADD f3, f3, f19 | |||||
| fmr f19, f27 | |||||
| FADD f4, f4, f20 | |||||
| fmr f20, f28 | |||||
| FADD f5, f5, f21 | |||||
| fmr f21, f29 | |||||
| FADD f6, f6, f22 | |||||
| fmr f22, f30 | |||||
| FADD f7, f7, f23 | |||||
| fmr f23, f31 | |||||
| FADD f0, f0, f16 | |||||
| FADD f1, f1, f17 | |||||
| FADD f2, f2, f18 | |||||
| FADD f3, f3, f19 | |||||
| FADD f4, f4, f20 | |||||
| FADD f5, f5, f21 | |||||
| FADD f6, f6, f22 | |||||
| FADD f7, f7, f23 | |||||
| addi X, X, 16 * SIZE | |||||
| .align 4 | |||||
/* Unit-stride remainder: N & 15 elements, one per iteration. */
| LL(50): | |||||
| andi. r0, N, 15 | |||||
| mtspr CTR, r0 | |||||
| beq LL(999) | |||||
| .align 4 | |||||
| LL(60): | |||||
| LFD f8, 0 * SIZE(X) | |||||
| addi X, X, 1 * SIZE | |||||
| FADD f0, f0, f8 | |||||
| bdnz LL(60) | |||||
| b LL(999) | |||||
| .align 4 | |||||
/* ---- Strided path.  X is first backed up by one stride because every
   LFDUX below adds INCX to X before loading. ---- */
| LL(100): | |||||
| sub X, X, INCX | |||||
| srawi. r0, N, 4 | |||||
| mtspr CTR, r0 | |||||
| beq- LL(150) | |||||
/* Preload the first 16 elements; stage the first eight into f16..f23. */
| LFDUX f8, X, INCX | |||||
| LFDUX f9, X, INCX | |||||
| LFDUX f10, X, INCX | |||||
| LFDUX f11, X, INCX | |||||
| LFDUX f12, X, INCX | |||||
| LFDUX f13, X, INCX | |||||
| LFDUX f14, X, INCX | |||||
| LFDUX f15, X, INCX | |||||
| LFDUX f24, X, INCX | |||||
| LFDUX f25, X, INCX | |||||
| LFDUX f26, X, INCX | |||||
| LFDUX f27, X, INCX | |||||
| LFDUX f28, X, INCX | |||||
| LFDUX f29, X, INCX | |||||
| LFDUX f30, X, INCX | |||||
| LFDUX f31, X, INCX | |||||
| fmr f16, f8 | |||||
| fmr f17, f9 | |||||
| fmr f18, f10 | |||||
| fmr f19, f11 | |||||
| fmr f20, f12 | |||||
| fmr f21, f13 | |||||
| fmr f22, f14 | |||||
| fmr f23, f15 | |||||
| bdz LL(120) | |||||
| .align 4 | |||||
/* Steady state for the strided case. */
| LL(110): | |||||
| FADD f0, f0, f16 | |||||
| fmr f16, f24 | |||||
| FADD f1, f1, f17 | |||||
| fmr f17, f25 | |||||
| FADD f2, f2, f18 | |||||
| fmr f18, f26 | |||||
| FADD f3, f3, f19 | |||||
| fmr f19, f27 | |||||
| LFDUX f8, X, INCX | |||||
| LFDUX f9, X, INCX | |||||
| LFDUX f10, X, INCX | |||||
| LFDUX f11, X, INCX | |||||
| FADD f4, f4, f20 | |||||
| fmr f20, f28 | |||||
| FADD f5, f5, f21 | |||||
| fmr f21, f29 | |||||
| FADD f6, f6, f22 | |||||
| fmr f22, f30 | |||||
| FADD f7, f7, f23 | |||||
| fmr f23, f31 | |||||
| LFDUX f12, X, INCX | |||||
| LFDUX f13, X, INCX | |||||
| LFDUX f14, X, INCX | |||||
| LFDUX f15, X, INCX | |||||
| FADD f0, f0, f16 | |||||
| fmr f16, f8 | |||||
| FADD f1, f1, f17 | |||||
| fmr f17, f9 | |||||
| FADD f2, f2, f18 | |||||
| fmr f18, f10 | |||||
| FADD f3, f3, f19 | |||||
| fmr f19, f11 | |||||
| LFDUX f24, X, INCX | |||||
| LFDUX f25, X, INCX | |||||
| LFDUX f26, X, INCX | |||||
| LFDUX f27, X, INCX | |||||
| FADD f4, f4, f20 | |||||
| fmr f20, f12 | |||||
| FADD f5, f5, f21 | |||||
| fmr f21, f13 | |||||
| FADD f6, f6, f22 | |||||
| fmr f22, f14 | |||||
| FADD f7, f7, f23 | |||||
| fmr f23, f15 | |||||
| LFDUX f28, X, INCX | |||||
| LFDUX f29, X, INCX | |||||
| LFDUX f30, X, INCX | |||||
| LFDUX f31, X, INCX | |||||
| bdnz LL(110) | |||||
| .align 4 | |||||
/* Drain the last 16 loaded elements. */
| LL(120): | |||||
| FADD f0, f0, f16 | |||||
| fmr f16, f24 | |||||
| FADD f1, f1, f17 | |||||
| fmr f17, f25 | |||||
| FADD f2, f2, f18 | |||||
| fmr f18, f26 | |||||
| FADD f3, f3, f19 | |||||
| fmr f19, f27 | |||||
| FADD f4, f4, f20 | |||||
| fmr f20, f28 | |||||
| FADD f5, f5, f21 | |||||
| fmr f21, f29 | |||||
| FADD f6, f6, f22 | |||||
| fmr f22, f30 | |||||
| FADD f7, f7, f23 | |||||
| fmr f23, f31 | |||||
| FADD f0, f0, f16 | |||||
| FADD f1, f1, f17 | |||||
| FADD f2, f2, f18 | |||||
| FADD f3, f3, f19 | |||||
| FADD f4, f4, f20 | |||||
| FADD f5, f5, f21 | |||||
| FADD f6, f6, f22 | |||||
| FADD f7, f7, f23 | |||||
| .align 4 | |||||
/* Strided remainder: N & 15 elements, one per iteration. */
| LL(150): | |||||
| andi. r0, N, 15 | |||||
| mtspr CTR, r0 | |||||
| beq LL(999) | |||||
| .align 4 | |||||
| LL(160): | |||||
| LFDUX f8, X, INCX | |||||
| FADD f0, f0, f8 | |||||
| bdnz LL(160) | |||||
| .align 4 | |||||
/* Exit: reduce the eight accumulators pairwise; the total goes to f1. */
| LL(999): | |||||
| FADD f0, f0, f1 | |||||
| FADD f2, f2, f3 | |||||
| FADD f4, f4, f5 | |||||
| FADD f6, f6, f7 | |||||
| FADD f0, f0, f2 | |||||
| FADD f4, f4, f6 | |||||
| FADD f1, f0, f4 | |||||
/* Restore f14..f31 and release the stack frame. */
| lfd f14, 0(SP) | |||||
| lfd f15, 8(SP) | |||||
| lfd f16, 16(SP) | |||||
| lfd f17, 24(SP) | |||||
| lfd f18, 32(SP) | |||||
| lfd f19, 40(SP) | |||||
| lfd f20, 48(SP) | |||||
| lfd f21, 56(SP) | |||||
| lfd f22, 64(SP) | |||||
| lfd f23, 72(SP) | |||||
| lfd f24, 80(SP) | |||||
| lfd f25, 88(SP) | |||||
| lfd f26, 96(SP) | |||||
| lfd f27, 104(SP) | |||||
| lfd f28, 112(SP) | |||||
| lfd f29, 120(SP) | |||||
| lfd f30, 128(SP) | |||||
| lfd f31, 136(SP) | |||||
| addi SP, SP, STACKSIZE | |||||
| blr | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,452 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER /* Power kernel: adds up every value of a vector whose elements hold two values each (stride scaled by ZBASE_SHIFT); no absolute value is taken. */ | |||||
| #include "common.h" | |||||
| #define N r3 /* element count; drives both loop counters */ | |||||
| #define X r4 /* pointer to the current input position */ | |||||
| #define INCX r5 /* element increment; shifted left by ZBASE_SHIFT below */ | |||||
| #define INCXM1 r9 /* INCX - SIZE; index used for the first load of each pair on the strided path */ | |||||
| #define PREA r8 /* holds L1_PREFETCHSIZE for L1_PREFETCH */ | |||||
| #define FZERO f0 /* zero source; f0 doubles as the first accumulator */ | |||||
| #define STACKSIZE 160 /* frame bytes: f14-f31 saved at 0..136, scratch word at 144 */ | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
| addi SP, SP, -STACKSIZE /* reserve the frame */ | |||||
| li r0, 0 | |||||
| stfd f14, 0(SP) /* save f14-f31; all of them are overwritten below */ | |||||
| stfd f15, 8(SP) | |||||
| stfd f16, 16(SP) | |||||
| stfd f17, 24(SP) | |||||
| stfd f18, 32(SP) | |||||
| stfd f19, 40(SP) | |||||
| stfd f20, 48(SP) | |||||
| stfd f21, 56(SP) | |||||
| stfd f22, 64(SP) | |||||
| stfd f23, 72(SP) | |||||
| stfd f24, 80(SP) | |||||
| stfd f25, 88(SP) | |||||
| stfd f26, 96(SP) | |||||
| stfd f27, 104(SP) | |||||
| stfd f28, 112(SP) | |||||
| stfd f29, 120(SP) | |||||
| stfd f30, 128(SP) | |||||
| stfd f31, 136(SP) | |||||
| stw r0, 144(SP) /* write integer 0 to the scratch word ... */ | |||||
| lfs FZERO,144(SP) /* ... and read it back as a float to get 0.0 in f0 */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* here N and INCX hold addresses; replace them with the pointed-to values */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| slwi INCX, INCX, ZBASE_SHIFT /* presumably converts the increment to a byte stride - ZBASE_SHIFT is defined elsewhere */ | |||||
| subi INCXM1, INCX, SIZE | |||||
| fmr f1, FZERO /* clear accumulators f1-f7 (f0 is already zero) */ | |||||
| fmr f2, FZERO | |||||
| fmr f3, FZERO | |||||
| fmr f4, FZERO | |||||
| fmr f5, FZERO | |||||
| fmr f6, FZERO | |||||
| fmr f7, FZERO | |||||
| li PREA, L1_PREFETCHSIZE | |||||
| cmpwi cr0, N, 0 | |||||
| ble- LL(999) /* N <= 0: go straight to the reduction of the zeroed accumulators */ | |||||
| cmpwi cr0, INCX, 0 | |||||
| ble- LL(999) /* scaled INCX <= 0: same early exit */ | |||||
| cmpwi cr0, INCX, 2 * SIZE | |||||
| bne- cr0, LL(100) /* stride is not 2 * SIZE: use the indexed-load path */ | |||||
| srawi. r0, N, 3 /* r0 = N >> 3: number of 8-element (16-value) groups */ | |||||
| mtspr CTR, r0 | |||||
| beq- cr0, LL(50) /* no full group: only the remainder loop runs */ | |||||
| .align 4 | |||||
| LFD f8, 0 * SIZE(X) /* preload the first 16 values */ | |||||
| LFD f9, 1 * SIZE(X) | |||||
| LFD f10, 2 * SIZE(X) | |||||
| LFD f11, 3 * SIZE(X) | |||||
| LFD f12, 4 * SIZE(X) | |||||
| LFD f13, 5 * SIZE(X) | |||||
| LFD f14, 6 * SIZE(X) | |||||
| LFD f15, 7 * SIZE(X) | |||||
| LFD f24, 8 * SIZE(X) | |||||
| LFD f25, 9 * SIZE(X) | |||||
| LFD f26, 10 * SIZE(X) | |||||
| LFD f27, 11 * SIZE(X) | |||||
| LFD f28, 12 * SIZE(X) | |||||
| LFD f29, 13 * SIZE(X) | |||||
| LFD f30, 14 * SIZE(X) | |||||
| LFD f31, 15 * SIZE(X) | |||||
| fmr f16, f8 /* stage the first eight values in f16-f23, the operands of the FADDs below */ | |||||
| fmr f17, f9 | |||||
| fmr f18, f10 | |||||
| fmr f19, f11 | |||||
| fmr f20, f12 | |||||
| fmr f21, f13 | |||||
| fmr f22, f14 | |||||
| fmr f23, f15 | |||||
| bdz LL(20) /* only one group: skip the steady-state loop and drain */ | |||||
| .align 4 | |||||
| LL(10): /* steady state: accumulate the 16 staged values while loading the next 16 */ | |||||
| FADD f0, f0, f16 | |||||
| fmr f16, f24 | |||||
| FADD f1, f1, f17 | |||||
| fmr f17, f25 | |||||
| FADD f2, f2, f18 | |||||
| fmr f18, f26 | |||||
| FADD f3, f3, f19 | |||||
| fmr f19, f27 | |||||
| LFD f8, 16 * SIZE(X) | |||||
| LFD f9, 17 * SIZE(X) | |||||
| LFD f10, 18 * SIZE(X) | |||||
| LFD f11, 19 * SIZE(X) | |||||
| FADD f4, f4, f20 | |||||
| fmr f20, f28 | |||||
| FADD f5, f5, f21 | |||||
| fmr f21, f29 | |||||
| FADD f6, f6, f22 | |||||
| fmr f22, f30 | |||||
| FADD f7, f7, f23 | |||||
| fmr f23, f31 | |||||
| LFD f12, 20 * SIZE(X) | |||||
| LFD f13, 21 * SIZE(X) | |||||
| LFD f14, 22 * SIZE(X) | |||||
| LFD f15, 23 * SIZE(X) | |||||
| FADD f0, f0, f16 | |||||
| fmr f16, f8 | |||||
| FADD f1, f1, f17 | |||||
| fmr f17, f9 | |||||
| FADD f2, f2, f18 | |||||
| fmr f18, f10 | |||||
| FADD f3, f3, f19 | |||||
| fmr f19, f11 | |||||
| LFD f24, 24 * SIZE(X) | |||||
| LFD f25, 25 * SIZE(X) | |||||
| LFD f26, 26 * SIZE(X) | |||||
| LFD f27, 27 * SIZE(X) | |||||
| FADD f4, f4, f20 | |||||
| fmr f20, f12 | |||||
| FADD f5, f5, f21 | |||||
| fmr f21, f13 | |||||
| FADD f6, f6, f22 | |||||
| fmr f22, f14 | |||||
| FADD f7, f7, f23 | |||||
| fmr f23, f15 | |||||
| LFD f28, 28 * SIZE(X) | |||||
| LFD f29, 29 * SIZE(X) | |||||
| LFD f30, 30 * SIZE(X) | |||||
| LFD f31, 31 * SIZE(X) | |||||
| #ifndef POWER6 | |||||
| L1_PREFETCH X, PREA /* prefetch is issued before the pointer bump here, after it on POWER6 */ | |||||
| #endif | |||||
| addi X, X, 16 * SIZE | |||||
| #ifdef POWER6 | |||||
| L1_PREFETCH X, PREA | |||||
| #endif | |||||
| bdnz LL(10) | |||||
| .align 4 | |||||
| LL(20): /* drain: add the last 16 loaded values, no further loads */ | |||||
| FADD f0, f0, f16 | |||||
| fmr f16, f24 | |||||
| FADD f1, f1, f17 | |||||
| fmr f17, f25 | |||||
| FADD f2, f2, f18 | |||||
| fmr f18, f26 | |||||
| FADD f3, f3, f19 | |||||
| fmr f19, f27 | |||||
| FADD f4, f4, f20 | |||||
| fmr f20, f28 | |||||
| FADD f5, f5, f21 | |||||
| fmr f21, f29 | |||||
| FADD f6, f6, f22 | |||||
| fmr f22, f30 | |||||
| FADD f7, f7, f23 | |||||
| fmr f23, f31 | |||||
| FADD f0, f0, f16 | |||||
| FADD f1, f1, f17 | |||||
| FADD f2, f2, f18 | |||||
| FADD f3, f3, f19 | |||||
| FADD f4, f4, f20 | |||||
| FADD f5, f5, f21 | |||||
| FADD f6, f6, f22 | |||||
| FADD f7, f7, f23 | |||||
| addi X, X, 16 * SIZE /* step past the group that was just drained */ | |||||
| .align 4 | |||||
| LL(50): | |||||
| andi. r0, N, 7 /* remainder: N & 7 elements left */ | |||||
| mtspr CTR, r0 | |||||
| beq LL(999) | |||||
| .align 4 | |||||
| LL(60): /* one element (two values) per iteration */ | |||||
| LFD f8, 0 * SIZE(X) | |||||
| LFD f9, 1 * SIZE(X) | |||||
| addi X, X, 2 * SIZE | |||||
| FADD f0, f0, f8 | |||||
| FADD f1, f1, f9 | |||||
| bdnz LL(60) | |||||
| b LL(999) | |||||
| .align 4 | |||||
| LL(100): /* general-stride path */ | |||||
| sub X, X, INCXM1 /* bias X by -(INCX - SIZE) so that X + INCXM1 is the first value and X + INCX the second */ | |||||
| srawi. r0, N, 3 | |||||
| mtspr CTR, r0 | |||||
| beq- LL(150) | |||||
| LFDX f8, X, INCXM1 /* LFDX/LFDUX are macros; presumably indexed load and indexed load with update of X - confirm in common.h */ | |||||
| LFDUX f9, X, INCX | |||||
| LFDX f10, X, INCXM1 | |||||
| LFDUX f11, X, INCX | |||||
| LFDX f12, X, INCXM1 | |||||
| LFDUX f13, X, INCX | |||||
| LFDX f14, X, INCXM1 | |||||
| LFDUX f15, X, INCX | |||||
| LFDX f24, X, INCXM1 | |||||
| LFDUX f25, X, INCX | |||||
| LFDX f26, X, INCXM1 | |||||
| LFDUX f27, X, INCX | |||||
| LFDX f28, X, INCXM1 | |||||
| LFDUX f29, X, INCX | |||||
| LFDX f30, X, INCXM1 | |||||
| LFDUX f31, X, INCX | |||||
| fmr f16, f8 /* stage the first eight values, as on the contiguous path */ | |||||
| fmr f17, f9 | |||||
| fmr f18, f10 | |||||
| fmr f19, f11 | |||||
| fmr f20, f12 | |||||
| fmr f21, f13 | |||||
| fmr f22, f14 | |||||
| fmr f23, f15 | |||||
| bdz LL(120) | |||||
| .align 4 | |||||
| LL(110): /* steady state for the strided path; same schedule as LL(10) without prefetch */ | |||||
| FADD f0, f0, f16 | |||||
| fmr f16, f24 | |||||
| FADD f1, f1, f17 | |||||
| fmr f17, f25 | |||||
| FADD f2, f2, f18 | |||||
| fmr f18, f26 | |||||
| FADD f3, f3, f19 | |||||
| fmr f19, f27 | |||||
| LFDX f8, X, INCXM1 | |||||
| LFDUX f9, X, INCX | |||||
| LFDX f10, X, INCXM1 | |||||
| LFDUX f11, X, INCX | |||||
| FADD f4, f4, f20 | |||||
| fmr f20, f28 | |||||
| FADD f5, f5, f21 | |||||
| fmr f21, f29 | |||||
| FADD f6, f6, f22 | |||||
| fmr f22, f30 | |||||
| FADD f7, f7, f23 | |||||
| fmr f23, f31 | |||||
| LFDX f12, X, INCXM1 | |||||
| LFDUX f13, X, INCX | |||||
| LFDX f14, X, INCXM1 | |||||
| LFDUX f15, X, INCX | |||||
| FADD f0, f0, f16 | |||||
| fmr f16, f8 | |||||
| FADD f1, f1, f17 | |||||
| fmr f17, f9 | |||||
| FADD f2, f2, f18 | |||||
| fmr f18, f10 | |||||
| FADD f3, f3, f19 | |||||
| fmr f19, f11 | |||||
| LFDX f24, X, INCXM1 | |||||
| LFDUX f25, X, INCX | |||||
| LFDX f26, X, INCXM1 | |||||
| LFDUX f27, X, INCX | |||||
| FADD f4, f4, f20 | |||||
| fmr f20, f12 | |||||
| FADD f5, f5, f21 | |||||
| fmr f21, f13 | |||||
| FADD f6, f6, f22 | |||||
| fmr f22, f14 | |||||
| FADD f7, f7, f23 | |||||
| fmr f23, f15 | |||||
| LFDX f28, X, INCXM1 | |||||
| LFDUX f29, X, INCX | |||||
| LFDX f30, X, INCXM1 | |||||
| LFDUX f31, X, INCX | |||||
| bdnz LL(110) | |||||
| .align 4 | |||||
| LL(120): /* drain the last 16 loaded values */ | |||||
| FADD f0, f0, f16 | |||||
| fmr f16, f24 | |||||
| FADD f1, f1, f17 | |||||
| fmr f17, f25 | |||||
| FADD f2, f2, f18 | |||||
| fmr f18, f26 | |||||
| FADD f3, f3, f19 | |||||
| fmr f19, f27 | |||||
| FADD f4, f4, f20 | |||||
| fmr f20, f28 | |||||
| FADD f5, f5, f21 | |||||
| fmr f21, f29 | |||||
| FADD f6, f6, f22 | |||||
| fmr f22, f30 | |||||
| FADD f7, f7, f23 | |||||
| fmr f23, f31 | |||||
| FADD f0, f0, f16 | |||||
| FADD f1, f1, f17 | |||||
| FADD f2, f2, f18 | |||||
| FADD f3, f3, f19 | |||||
| FADD f4, f4, f20 | |||||
| FADD f5, f5, f21 | |||||
| FADD f6, f6, f22 | |||||
| FADD f7, f7, f23 | |||||
| .align 4 | |||||
| LL(150): | |||||
| andi. r0, N, 7 /* remainder: N & 7 elements left */ | |||||
| mtspr CTR, r0 | |||||
| beq LL(999) | |||||
| .align 4 | |||||
| LL(160): /* one strided element (two values) per iteration */ | |||||
| LFDX f8, X, INCXM1 | |||||
| LFDUX f9, X, INCX | |||||
| FADD f0, f0, f8 | |||||
| FADD f1, f1, f9 | |||||
| bdnz LL(160) | |||||
| .align 4 | |||||
| LL(999): /* common exit: fold the eight accumulators pairwise */ | |||||
| FADD f0, f0, f1 | |||||
| FADD f2, f2, f3 | |||||
| FADD f4, f4, f5 | |||||
| FADD f6, f6, f7 | |||||
| FADD f0, f0, f2 | |||||
| FADD f4, f4, f6 | |||||
| FADD f1, f0, f4 /* the final sum is written to f1 */ | |||||
| lfd f14, 0(SP) /* restore f14-f31 */ | |||||
| lfd f15, 8(SP) | |||||
| lfd f16, 16(SP) | |||||
| lfd f17, 24(SP) | |||||
| lfd f18, 32(SP) | |||||
| lfd f19, 40(SP) | |||||
| lfd f20, 48(SP) | |||||
| lfd f21, 56(SP) | |||||
| lfd f22, 64(SP) | |||||
| lfd f23, 72(SP) | |||||
| lfd f24, 80(SP) | |||||
| lfd f25, 88(SP) | |||||
| lfd f26, 96(SP) | |||||
| lfd f27, 104(SP) | |||||
| lfd f28, 112(SP) | |||||
| lfd f29, 120(SP) | |||||
| lfd f30, 128(SP) | |||||
| lfd f31, 136(SP) | |||||
| addi SP, SP, STACKSIZE /* release the frame */ | |||||
| blr | |||||
| EPILOGUE | |||||
| @@ -70,7 +70,7 @@ gotoblas_t TABLE_NAME = { | |||||
| samax_kTS, samin_kTS, smax_kTS, smin_kTS, | samax_kTS, samin_kTS, smax_kTS, smin_kTS, | ||||
| isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, | isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, | ||||
| snrm2_kTS, sasum_kTS, scopy_kTS, sdot_kTS, | |||||
| snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS, | |||||
| dsdot_kTS, | dsdot_kTS, | ||||
| srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, | srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, | ||||
| sgemv_nTS, sgemv_tTS, sger_kTS, | sgemv_nTS, sgemv_tTS, sger_kTS, | ||||
| @@ -126,7 +126,7 @@ gotoblas_t TABLE_NAME = { | |||||
| damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, | damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, | ||||
| idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, | idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, | ||||
| dnrm2_kTS, dasum_kTS, dcopy_kTS, ddot_kTS, | |||||
| dnrm2_kTS, dasum_kTS, dsum_kTS, dcopy_kTS, ddot_kTS, | |||||
| drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, | drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, | ||||
| dgemv_nTS, dgemv_tTS, dger_kTS, | dgemv_nTS, dgemv_tTS, dger_kTS, | ||||
| dsymv_LTS, dsymv_UTS, | dsymv_LTS, dsymv_UTS, | ||||
| @@ -178,7 +178,7 @@ gotoblas_t TABLE_NAME = { | |||||
| qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS, | qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS, | ||||
| iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS, | iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS, | ||||
| qnrm2_kTS, qasum_kTS, qcopy_kTS, qdot_kTS, | |||||
| qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS, | |||||
| qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, | qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, | ||||
| qgemv_nTS, qgemv_tTS, qger_kTS, | qgemv_nTS, qgemv_tTS, qger_kTS, | ||||
| qsymv_LTS, qsymv_UTS, | qsymv_LTS, qsymv_UTS, | ||||
| @@ -234,7 +234,7 @@ gotoblas_t TABLE_NAME = { | |||||
| #endif | #endif | ||||
| camax_kTS, camin_kTS, icamax_kTS, icamin_kTS, | camax_kTS, camin_kTS, icamax_kTS, icamin_kTS, | ||||
| cnrm2_kTS, casum_kTS, ccopy_kTS, | |||||
| cnrm2_kTS, casum_kTS, csum_kTS, ccopy_kTS, | |||||
| cdotu_kTS, cdotc_kTS, csrot_kTS, | cdotu_kTS, cdotc_kTS, csrot_kTS, | ||||
| caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS, | caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS, | ||||
| @@ -369,7 +369,7 @@ gotoblas_t TABLE_NAME = { | |||||
| #endif | #endif | ||||
| zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS, | zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS, | ||||
| znrm2_kTS, zasum_kTS, zcopy_kTS, | |||||
| znrm2_kTS, zasum_kTS, zsum_kTS, zcopy_kTS, | |||||
| zdotu_kTS, zdotc_kTS, zdrot_kTS, | zdotu_kTS, zdotc_kTS, zdrot_kTS, | ||||
| zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS, | zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS, | ||||
| @@ -500,7 +500,7 @@ gotoblas_t TABLE_NAME = { | |||||
| XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N), | XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N), | ||||
| xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS, | xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS, | ||||
| xnrm2_kTS, xasum_kTS, xcopy_kTS, | |||||
| xnrm2_kTS, xasum_kTS, xsum_kTS, xcopy_kTS, | |||||
| xdotu_kTS, xdotc_kTS, xqrot_kTS, | xdotu_kTS, xdotc_kTS, xqrot_kTS, | ||||
| xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS, | xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS, | ||||
| @@ -0,0 +1,325 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER /* SPARC kernel: plain sum of N values of X taken INCX apart; values are added as loaded, no absolute value is taken. */ | |||||
| #include "common.h" | |||||
| #define N %i0 /* element count */ | |||||
| #define X %i1 /* pointer to the current input position */ | |||||
| #define INCX %i2 /* element increment; shifted left by BASE_SHIFT below */ | |||||
| #define I %i3 /* loop counter */ | |||||
| #ifdef DOUBLE | |||||
| #define c1 %f0 /* c1, c2: the two running sums (even-numbered registers in the DOUBLE build) */ | |||||
| #define c2 %f2 | |||||
| #define t1 %f8 /* t1-t4: values waiting to be added on the next FADD */ | |||||
| #define t2 %f10 | |||||
| #define t3 %f12 | |||||
| #define t4 %f14 | |||||
| #define a1 %f16 /* a1-a8: load targets */ | |||||
| #define a2 %f18 | |||||
| #define a3 %f20 | |||||
| #define a4 %f22 | |||||
| #define a5 %f24 | |||||
| #define a6 %f26 | |||||
| #define a7 %f28 | |||||
| #define a8 %f30 | |||||
| #else | |||||
| #define c1 %f0 /* same roles as above, on consecutive registers */ | |||||
| #define c2 %f1 | |||||
| #define t1 %f4 | |||||
| #define t2 %f5 | |||||
| #define t3 %f6 | |||||
| #define t4 %f7 | |||||
| #define a1 %f8 | |||||
| #define a2 %f9 | |||||
| #define a3 %f10 | |||||
| #define a4 %f11 | |||||
| #define a5 %f12 | |||||
| #define a6 %f13 | |||||
| #define a7 %f14 | |||||
| #define a8 %f15 | |||||
| #endif | |||||
| PROLOGUE | |||||
| SAVESP | |||||
| FCLR(0) /* macro defined elsewhere; presumably zeroes %f0 (c1) - confirm */ | |||||
| sll INCX, BASE_SHIFT, INCX /* presumably converts the increment to a byte stride - BASE_SHIFT is defined elsewhere */ | |||||
| FMOV c1, c2 /* copy c1 into c2 and t1-t4 so all start from the same value */ | |||||
| FMOV c1, t1 | |||||
| FMOV c1, t2 | |||||
| FMOV c1, t3 | |||||
| FMOV c1, t4 | |||||
| cmp INCX, 0 | |||||
| ble .LL19 /* scaled INCX <= 0: go to the final reduction; the next instruction sits in the delay slot */ | |||||
| cmp INCX, SIZE | |||||
| bne .LL50 /* stride is not SIZE: general-stride path; the sra below sits in the delay slot and .LL50 recomputes I */ | |||||
| sra N, 3, I /* I = N >> 3: number of 8-value groups */ | |||||
| cmp I, 0 | |||||
| ble,pn %icc, .LL15 | |||||
| nop | |||||
| LDF [X + 0 * SIZE], a1 /* preload the first eight values */ | |||||
| add I, -1, I | |||||
| LDF [X + 1 * SIZE], a2 | |||||
| cmp I, 0 | |||||
| LDF [X + 2 * SIZE], a3 | |||||
| LDF [X + 3 * SIZE], a4 | |||||
| LDF [X + 4 * SIZE], a5 | |||||
| LDF [X + 5 * SIZE], a6 | |||||
| LDF [X + 6 * SIZE], a7 | |||||
| LDF [X + 7 * SIZE], a8 | |||||
| ble,pt %icc, .LL12 /* only one group: skip the steady-state loop */ | |||||
| add X, 8 * SIZE, X | |||||
| #define PREFETCHSIZE 128 | |||||
| .LL11: /* steady state: each value moves a -> t -> c, one stage per step */ | |||||
| FADD c1, t1, c1 | |||||
| prefetch [X + PREFETCHSIZE * SIZE], 0 | |||||
| FMOV a1, t1 | |||||
| LDF [X + 0 * SIZE], a1 | |||||
| FADD c2, t2, c2 | |||||
| add I, -1, I | |||||
| FMOV a2, t2 | |||||
| LDF [X + 1 * SIZE], a2 | |||||
| FADD c1, t3, c1 | |||||
| cmp I, 0 | |||||
| FMOV a3, t3 | |||||
| LDF [X + 2 * SIZE], a3 | |||||
| FADD c2, t4, c2 | |||||
| nop | |||||
| FMOV a4, t4 | |||||
| LDF [X + 3 * SIZE], a4 | |||||
| FADD c1, t1, c1 | |||||
| nop | |||||
| FMOV a5, t1 | |||||
| LDF [X + 4 * SIZE], a5 | |||||
| FADD c2, t2, c2 | |||||
| nop | |||||
| FMOV a6, t2 | |||||
| LDF [X + 5 * SIZE], a6 | |||||
| FADD c1, t3, c1 | |||||
| FMOV a7, t3 | |||||
| LDF [X + 6 * SIZE], a7 | |||||
| add X, 8 * SIZE, X | |||||
| FADD c2, t4, c2 | |||||
| FMOV a8, t4 | |||||
| bg,pt %icc, .LL11 | |||||
| LDF [X - 1 * SIZE], a8 /* delay slot: X was already advanced, so this is the eighth value of the group */ | |||||
| .LL12: /* drain: add what is staged in t1-t4 and move the loaded a1-a8 through */ | |||||
| FADD c1, t1, c1 | |||||
| FMOV a1, t1 | |||||
| FADD c2, t2, c2 | |||||
| FMOV a2, t2 | |||||
| FADD c1, t3, c1 | |||||
| FMOV a3, t3 | |||||
| FADD c2, t4, c2 | |||||
| FMOV a4, t4 | |||||
| FADD c1, t1, c1 | |||||
| FMOV a5, t1 | |||||
| FADD c2, t2, c2 | |||||
| FMOV a6, t2 | |||||
| FADD c1, t3, c1 | |||||
| FMOV a7, t3 | |||||
| FADD c2, t4, c2 | |||||
| FMOV a8, t4 /* a5-a8 stay staged in t1-t4 and are added later */ | |||||
| .LL15: | |||||
| and N, 7, I /* remainder: N & 7 values left */ | |||||
| cmp I, 0 | |||||
| ble,a,pn %icc, .LL19 | |||||
| nop | |||||
| .LL16: /* one value per iteration, still one step behind through t1 */ | |||||
| LDF [X + 0 * SIZE], a1 | |||||
| add I, -1, I | |||||
| cmp I, 0 | |||||
| FADD c1, t1, c1 | |||||
| FMOV a1, t1 | |||||
| bg,pt %icc, .LL16 | |||||
| add X, 1 * SIZE, X | |||||
| .LL19: /* final reduction: add the staged t1-t4, then combine c2 into c1 */ | |||||
| FADD c1, t1, c1 | |||||
| FADD c2, t2, c2 | |||||
| FADD c1, t3, c1 | |||||
| FADD c2, t4, c2 | |||||
| FADD c1, c2, c1 /* total is left in c1 */ | |||||
| return %i7 + 8 | |||||
| clr %g0 /* NOTE(review): this exit clears %g0 in the delay slot while .LL59 clears %o0 - confirm which is intended */ | |||||
| .LL50: /* general-stride path: same schedule, X advanced by INCX after every load */ | |||||
| sra N, 3, I | |||||
| cmp I, 0 | |||||
| ble,pn %icc, .LL55 | |||||
| nop | |||||
| LDF [X + 0 * SIZE], a1 | |||||
| add X, INCX, X | |||||
| LDF [X + 0 * SIZE], a2 | |||||
| add X, INCX, X | |||||
| LDF [X + 0 * SIZE], a3 | |||||
| add X, INCX, X | |||||
| LDF [X + 0 * SIZE], a4 | |||||
| add X, INCX, X | |||||
| LDF [X + 0 * SIZE], a5 | |||||
| add X, INCX, X | |||||
| LDF [X + 0 * SIZE], a6 | |||||
| add X, INCX, X | |||||
| add I, -1, I | |||||
| LDF [X + 0 * SIZE], a7 | |||||
| cmp I, 0 | |||||
| add X, INCX, X | |||||
| LDF [X + 0 * SIZE], a8 | |||||
| ble,pt %icc, .LL52 | |||||
| add X, INCX, X | |||||
| .LL51: | |||||
| FADD c1, t1, c1 | |||||
| add I, -1, I | |||||
| FMOV a1, t1 | |||||
| LDF [X + 0 * SIZE], a1 | |||||
| add X, INCX, X | |||||
| FADD c2, t2, c2 | |||||
| cmp I, 0 | |||||
| FMOV a2, t2 | |||||
| LDF [X + 0 * SIZE], a2 | |||||
| add X, INCX, X | |||||
| FADD c1, t3, c1 | |||||
| FMOV a3, t3 | |||||
| LDF [X + 0 * SIZE], a3 | |||||
| add X, INCX, X | |||||
| FADD c2, t4, c2 | |||||
| FMOV a4, t4 | |||||
| LDF [X + 0 * SIZE], a4 | |||||
| add X, INCX, X | |||||
| FADD c1, t1, c1 | |||||
| FMOV a5, t1 | |||||
| LDF [X + 0 * SIZE], a5 | |||||
| add X, INCX, X | |||||
| FADD c2, t2, c2 | |||||
| FMOV a6, t2 | |||||
| LDF [X + 0 * SIZE], a6 | |||||
| add X, INCX, X | |||||
| FADD c1, t3, c1 | |||||
| FMOV a7, t3 | |||||
| LDF [X + 0 * SIZE], a7 | |||||
| add X, INCX, X | |||||
| FADD c2, t4, c2 | |||||
| FMOV a8, t4 | |||||
| LDF [X + 0 * SIZE], a8 | |||||
| bg,pt %icc, .LL51 | |||||
| add X, INCX, X | |||||
| .LL52: /* drain, as at .LL12 */ | |||||
| FADD c1, t1, c1 | |||||
| FMOV a1, t1 | |||||
| FADD c2, t2, c2 | |||||
| FMOV a2, t2 | |||||
| FADD c1, t3, c1 | |||||
| FMOV a3, t3 | |||||
| FADD c2, t4, c2 | |||||
| FMOV a4, t4 | |||||
| FADD c1, t1, c1 | |||||
| FMOV a5, t1 | |||||
| FADD c2, t2, c2 | |||||
| FMOV a6, t2 | |||||
| FADD c1, t3, c1 | |||||
| FMOV a7, t3 | |||||
| FADD c2, t4, c2 | |||||
| FMOV a8, t4 | |||||
| .LL55: | |||||
| and N, 7, I /* remainder: N & 7 values left */ | |||||
| cmp I, 0 | |||||
| ble,a,pn %icc, .LL59 | |||||
| nop | |||||
| .LL56: | |||||
| LDF [X + 0 * SIZE], a1 | |||||
| FADD c1, t1, c1 | |||||
| add I, -1, I | |||||
| FMOV a1, t1 | |||||
| cmp I, 0 | |||||
| bg,pt %icc, .LL56 | |||||
| add X, INCX, X | |||||
| .LL59: /* final reduction for the strided path; same arithmetic as .LL19 */ | |||||
| FADD c1, t1, c1 | |||||
| FADD c2, t2, c2 | |||||
| FADD c1, t3, c1 | |||||
| FADD c2, t4, c2 | |||||
| FADD c1, c2, c1 | |||||
| return %i7 + 8 | |||||
| clr %o0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,327 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER /* SPARC kernel: sums both values of each of N two-value elements of X (stride scaled by ZBASE_SHIFT); no absolute value is taken. */ | |||||
| #include "common.h" | |||||
| #define N %i0 /* element count */ | |||||
| #define X %i1 /* pointer to the current input position */ | |||||
| #define INCX %i2 /* element increment; shifted left by ZBASE_SHIFT below */ | |||||
| #define I %i3 /* loop counter */ | |||||
| #ifdef DOUBLE | |||||
| #define c1 %f0 /* c1, c2: the two running sums (even-numbered registers in the DOUBLE build) */ | |||||
| #define c2 %f2 | |||||
| #define t1 %f8 /* t1-t4: values waiting to be added on the next FADD */ | |||||
| #define t2 %f10 | |||||
| #define t3 %f12 | |||||
| #define t4 %f14 | |||||
| #define a1 %f16 /* a1-a8: load targets */ | |||||
| #define a2 %f18 | |||||
| #define a3 %f20 | |||||
| #define a4 %f22 | |||||
| #define a5 %f24 | |||||
| #define a6 %f26 | |||||
| #define a7 %f28 | |||||
| #define a8 %f30 | |||||
| #else | |||||
| #define c1 %f0 /* same roles as above, on consecutive registers */ | |||||
| #define c2 %f1 | |||||
| #define t1 %f4 | |||||
| #define t2 %f5 | |||||
| #define t3 %f6 | |||||
| #define t4 %f7 | |||||
| #define a1 %f8 | |||||
| #define a2 %f9 | |||||
| #define a3 %f10 | |||||
| #define a4 %f11 | |||||
| #define a5 %f12 | |||||
| #define a6 %f13 | |||||
| #define a7 %f14 | |||||
| #define a8 %f15 | |||||
| #endif | |||||
| PROLOGUE | |||||
| SAVESP | |||||
| FCLR(0) /* macro defined elsewhere; presumably zeroes %f0 (c1) - confirm */ | |||||
| sll INCX, ZBASE_SHIFT, INCX /* presumably converts the increment to a byte stride - ZBASE_SHIFT is defined elsewhere */ | |||||
| FMOV c1, c2 /* copy c1 into c2 and t1-t4 so all start from the same value */ | |||||
| FMOV c1, t1 | |||||
| FMOV c1, t2 | |||||
| FMOV c1, t3 | |||||
| FMOV c1, t4 | |||||
| cmp INCX, 0 | |||||
| ble .LL19 /* scaled INCX <= 0: go to the final reduction */ | |||||
| nop | |||||
| cmp INCX, 2 * SIZE | |||||
| bne .LL50 /* stride is not 2 * SIZE: general-stride path */ | |||||
| nop | |||||
| sra N, 2, I /* I = N >> 2: number of 4-element (8-value) groups */ | |||||
| cmp I, 0 | |||||
| ble,pn %icc, .LL15 | |||||
| nop | |||||
| LDF [X + 0 * SIZE], a1 /* preload the first eight values */ | |||||
| add I, -1, I | |||||
| LDF [X + 1 * SIZE], a2 | |||||
| cmp I, 0 | |||||
| LDF [X + 2 * SIZE], a3 | |||||
| LDF [X + 3 * SIZE], a4 | |||||
| LDF [X + 4 * SIZE], a5 | |||||
| LDF [X + 5 * SIZE], a6 | |||||
| LDF [X + 6 * SIZE], a7 | |||||
| LDF [X + 7 * SIZE], a8 | |||||
| ble,pt %icc, .LL12 /* only one group: skip the steady-state loop */ | |||||
| add X, 8 * SIZE, X | |||||
| #define PREFETCHSIZE 32 | |||||
| .LL11: /* steady state: each value moves a -> t -> c, one stage per step */ | |||||
| FADD c1, t1, c1 | |||||
| prefetch [X + PREFETCHSIZE * SIZE], 0 | |||||
| FMOV a1, t1 | |||||
| LDF [X + 0 * SIZE], a1 | |||||
| FADD c2, t2, c2 | |||||
| add I, -1, I | |||||
| FMOV a2, t2 | |||||
| LDF [X + 1 * SIZE], a2 | |||||
| FADD c1, t3, c1 | |||||
| cmp I, 0 | |||||
| FMOV a3, t3 | |||||
| LDF [X + 2 * SIZE], a3 | |||||
| FADD c2, t4, c2 | |||||
| nop | |||||
| FMOV a4, t4 | |||||
| LDF [X + 3 * SIZE], a4 | |||||
| FADD c1, t1, c1 | |||||
| nop | |||||
| FMOV a5, t1 | |||||
| LDF [X + 4 * SIZE], a5 | |||||
| FADD c2, t2, c2 | |||||
| nop | |||||
| FMOV a6, t2 | |||||
| LDF [X + 5 * SIZE], a6 | |||||
| FADD c1, t3, c1 | |||||
| FMOV a7, t3 | |||||
| LDF [X + 6 * SIZE], a7 | |||||
| add X, 8 * SIZE, X | |||||
| FADD c2, t4, c2 | |||||
| FMOV a8, t4 | |||||
| bg,pt %icc, .LL11 | |||||
| LDF [X - 1 * SIZE], a8 /* delay slot: X was already advanced, so this is the eighth value of the group */ | |||||
| .LL12: /* drain: add what is staged in t1-t4 and move the loaded a1-a8 through */ | |||||
| FADD c1, t1, c1 | |||||
| FMOV a1, t1 | |||||
| FADD c2, t2, c2 | |||||
| FMOV a2, t2 | |||||
| FADD c1, t3, c1 | |||||
| FMOV a3, t3 | |||||
| FADD c2, t4, c2 | |||||
| FMOV a4, t4 | |||||
| FADD c1, t1, c1 | |||||
| FMOV a5, t1 | |||||
| FADD c2, t2, c2 | |||||
| FMOV a6, t2 | |||||
| FADD c1, t3, c1 | |||||
| FMOV a7, t3 | |||||
| FADD c2, t4, c2 | |||||
| FMOV a8, t4 /* a5-a8 stay staged in t1-t4 and are added later */ | |||||
| .LL15: | |||||
| and N, 3, I /* remainder: N & 3 elements left */ | |||||
| cmp I, 0 | |||||
| ble,a,pn %icc, .LL19 | |||||
| nop | |||||
| .LL16: /* one element (two values) per iteration, one step behind through t1/t2 */ | |||||
| LDF [X + 0 * SIZE], a1 | |||||
| LDF [X + 1 * SIZE], a2 | |||||
| add I, -1, I | |||||
| cmp I, 0 | |||||
| FADD c1, t1, c1 | |||||
| FADD c2, t2, c2 | |||||
| FMOV a1, t1 | |||||
| FMOV a2, t2 | |||||
| bg,pt %icc, .LL16 | |||||
| add X, 2 * SIZE, X | |||||
| .LL19: /* final reduction: add the staged t1-t4, then combine c2 into c1 */ | |||||
| FADD c1, t1, c1 | |||||
| FADD c2, t2, c2 | |||||
| FADD c1, t3, c1 | |||||
| FADD c2, t4, c2 | |||||
| FADD c1, c2, c1 /* total is left in c1 */ | |||||
| return %i7 + 8 | |||||
| clr %g0 /* NOTE(review): this exit clears %g0 in the delay slot while .LL59 clears %o0 - confirm which is intended */ | |||||
| .LL50: /* general-stride path: two loads per element, then X advanced by INCX */ | |||||
| sra N, 2, I | |||||
| cmp I, 0 | |||||
| ble,pn %icc, .LL55 | |||||
| nop | |||||
| LDF [X + 0 * SIZE], a1 | |||||
| LDF [X + 1 * SIZE], a2 | |||||
| add X, INCX, X | |||||
| LDF [X + 0 * SIZE], a3 | |||||
| LDF [X + 1 * SIZE], a4 | |||||
| add X, INCX, X | |||||
| LDF [X + 0 * SIZE], a5 | |||||
| LDF [X + 1 * SIZE], a6 | |||||
| add X, INCX, X | |||||
| add I, -1, I | |||||
| LDF [X + 0 * SIZE], a7 | |||||
| cmp I, 0 | |||||
| LDF [X + 1 * SIZE], a8 | |||||
| ble,pt %icc, .LL52 | |||||
| add X, INCX, X | |||||
| .LL51: | |||||
| FADD c1, t1, c1 | |||||
| add I, -1, I | |||||
| FMOV a1, t1 | |||||
| LDF [X + 0 * SIZE], a1 | |||||
| FADD c2, t2, c2 | |||||
| cmp I, 0 | |||||
| FMOV a2, t2 | |||||
| LDF [X + 1 * SIZE], a2 | |||||
| add X, INCX, X | |||||
| FADD c1, t3, c1 | |||||
| FMOV a3, t3 | |||||
| LDF [X + 0 * SIZE], a3 | |||||
| FADD c2, t4, c2 | |||||
| FMOV a4, t4 | |||||
| LDF [X + 1 * SIZE], a4 | |||||
| add X, INCX, X | |||||
| FADD c1, t1, c1 | |||||
| FMOV a5, t1 | |||||
| LDF [X + 0 * SIZE], a5 | |||||
| FADD c2, t2, c2 | |||||
| FMOV a6, t2 | |||||
| LDF [X + 1 * SIZE], a6 | |||||
| add X, INCX, X | |||||
| FADD c1, t3, c1 | |||||
| FMOV a7, t3 | |||||
| LDF [X + 0 * SIZE], a7 | |||||
| FADD c2, t4, c2 | |||||
| FMOV a8, t4 | |||||
| LDF [X + 1 * SIZE], a8 | |||||
| bg,pt %icc, .LL51 | |||||
| add X, INCX, X | |||||
| .LL52: /* drain, as at .LL12 */ | |||||
| FADD c1, t1, c1 | |||||
| FMOV a1, t1 | |||||
| FADD c2, t2, c2 | |||||
| FMOV a2, t2 | |||||
| FADD c1, t3, c1 | |||||
| FMOV a3, t3 | |||||
| FADD c2, t4, c2 | |||||
| FMOV a4, t4 | |||||
| FADD c1, t1, c1 | |||||
| FMOV a5, t1 | |||||
| FADD c2, t2, c2 | |||||
| FMOV a6, t2 | |||||
| FADD c1, t3, c1 | |||||
| FMOV a7, t3 | |||||
| FADD c2, t4, c2 | |||||
| FMOV a8, t4 | |||||
| .LL55: | |||||
| and N, 3, I /* remainder: N & 3 elements left */ | |||||
| cmp I, 0 | |||||
| ble,a,pn %icc, .LL59 | |||||
| nop | |||||
| .LL56: | |||||
| LDF [X + 0 * SIZE], a1 | |||||
| LDF [X + 1 * SIZE], a2 | |||||
| FADD c1, t1, c1 | |||||
| FADD c2, t2, c2 | |||||
| add I, -1, I | |||||
| FMOV a1, t1 | |||||
| FMOV a2, t2 | |||||
| cmp I, 0 | |||||
| bg,pt %icc, .LL56 | |||||
| add X, INCX, X | |||||
| .LL59: /* final reduction for the strided path; same arithmetic as .LL19 */ | |||||
| FADD c1, t1, c1 | |||||
| FADD c2, t2, c2 | |||||
| FADD c1, t3, c1 | |||||
| FADD c2, t4, c2 | |||||
| FADD c1, c2, c1 | |||||
| return %i7 + 8 | |||||
| clr %o0 | |||||
| EPILOGUE | |||||
| @@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c | |||||
| CASUMKERNEL = ../arm/zasum.c | CASUMKERNEL = ../arm/zasum.c | ||||
| ZASUMKERNEL = ../arm/zasum.c | ZASUMKERNEL = ../arm/zasum.c | ||||
| SSUMKERNEL = ../arm/sum.c | |||||
| DSUMKERNEL = ../arm/sum.c | |||||
| CSUMKERNEL = ../arm/zsum.c | |||||
| ZSUMKERNEL = ../arm/zsum.c | |||||
| SAXPYKERNEL = ../arm/axpy.c | SAXPYKERNEL = ../arm/axpy.c | ||||
| DAXPYKERNEL = ../arm/axpy.c | DAXPYKERNEL = ../arm/axpy.c | ||||
| CAXPYKERNEL = ../arm/zaxpy.c | CAXPYKERNEL = ../arm/zaxpy.c | ||||
| @@ -0,0 +1,207 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER /* 32-bit x86 (x87) kernel: plain sum of M values of X taken INCX apart, left on the x87 stack top; no absolute value is taken. */ | |||||
| #include "common.h" | |||||
| #define STACK 8 /* bytes pushed by the two pushl instructions in the prologue */ | |||||
| #define ARGS 0 | |||||
| #define STACK_M 4 + STACK + ARGS(%esp) /* stack slots of the three arguments, addressed after the pushes */ | |||||
| #define STACK_X 8 + STACK + ARGS(%esp) | |||||
| #define STACK_INCX 12 + STACK + ARGS(%esp) | |||||
| #define M %edx /* element count */ | |||||
| #define X %ecx /* pointer to the current input position */ | |||||
| #define INCX %esi /* element increment; shifted left by BASE_SHIFT below */ | |||||
| #define I %eax /* loop counter */ | |||||
| #include "l1param.h" | |||||
| PROLOGUE | |||||
| pushl %esi | |||||
| pushl %ebx /* saved and restored, but not referenced by the visible instructions */ | |||||
| PROFCODE | |||||
| #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) | |||||
| EMMS | |||||
| #endif | |||||
| movl STACK_M, M | |||||
| movl STACK_X, X | |||||
| movl STACK_INCX, INCX | |||||
| #ifdef F_INTERFACE | |||||
| movl (M), M /* here M and INCX hold addresses; replace them with the pointed-to values */ | |||||
| movl (INCX), INCX | |||||
| #endif | |||||
| fldz /* first accumulator; also the 0.0 result for the early exits below */ | |||||
| testl M, M | |||||
| jle .L999 /* M <= 0: exit with 0.0 on the stack top */ | |||||
| testl INCX, INCX | |||||
| jle .L999 /* INCX <= 0: same early exit */ | |||||
| sall $BASE_SHIFT, INCX /* presumably converts the increment to a byte stride - BASE_SHIFT is defined elsewhere */ | |||||
| fldz /* three more zeros: four accumulators now sit on the x87 stack */ | |||||
| fldz | |||||
| fldz | |||||
| cmpl $SIZE, INCX | |||||
| jne .L40 /* stride is not SIZE: general-stride path */ | |||||
| movl M, I | |||||
| sarl $3, I /* I = M >> 3: number of 8-value groups */ | |||||
| jle .L20 | |||||
| ALIGN_4 | |||||
| .L10: /* contiguous main loop: eight values per iteration, in two batches of four */ | |||||
| #ifdef PREFETCH | |||||
| PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) | |||||
| #endif | |||||
| FLD 0 * SIZE(X) /* push four values on top of the four accumulators */ | |||||
| FLD 1 * SIZE(X) | |||||
| FLD 2 * SIZE(X) | |||||
| FLD 3 * SIZE(X) | |||||
| faddp %st, %st(7) /* each add-and-pop folds the top value into a different accumulator */ | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| FLD 4 * SIZE(X) | |||||
| FLD 5 * SIZE(X) | |||||
| FLD 6 * SIZE(X) | |||||
| FLD 7 * SIZE(X) | |||||
| addl $8 * SIZE, X | |||||
| faddp %st, %st(7) | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| decl I | |||||
| jg .L10 | |||||
| ALIGN_4 | |||||
| .L20: | |||||
| movl M, I | |||||
| andl $7, I /* remainder: M & 7 values left */ | |||||
| jle .L998 | |||||
| ALIGN_4 | |||||
| .L21: /* one value per iteration, added into the top accumulator */ | |||||
| FLD (X) | |||||
| faddp %st,%st(1) | |||||
| addl $1 * SIZE, X | |||||
| decl I | |||||
| jg .L21 | |||||
| jmp .L998 | |||||
| ALIGN_4 | |||||
| .L40: /* general-stride path: same accumulation, X advanced by INCX after every load */ | |||||
| movl M, I | |||||
| sarl $3, I | |||||
| jle .L60 | |||||
| ALIGN_4 | |||||
| .L50: | |||||
| FLD (X) | |||||
| addl INCX, X | |||||
| FLD (X) | |||||
| addl INCX, X | |||||
| FLD (X) | |||||
| addl INCX, X | |||||
| FLD (X) | |||||
| addl INCX, X | |||||
| faddp %st, %st(7) | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| FLD (X) | |||||
| addl INCX, X | |||||
| FLD (X) | |||||
| addl INCX, X | |||||
| FLD (X) | |||||
| addl INCX, X | |||||
| FLD (X) | |||||
| addl INCX, X | |||||
| faddp %st, %st(7) | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| decl I | |||||
| jg .L50 | |||||
| ALIGN_4 | |||||
| .L60: | |||||
| movl M, I | |||||
| andl $7, I /* remainder: M & 7 values left */ | |||||
| jle .L998 | |||||
| ALIGN_4 | |||||
| .L61: | |||||
| FLD (X) | |||||
| addl INCX, X | |||||
| faddp %st,%st(1) | |||||
| decl I | |||||
| jg .L61 | |||||
| ALIGN_4 | |||||
| .L998: /* fold the four accumulators into one; three add-and-pops leave a single value */ | |||||
| faddp %st,%st(2) | |||||
| faddp %st,%st(1) | |||||
| faddp %st,%st(1) | |||||
| ALIGN_4 | |||||
| .L999: /* common exit: the result is the single value left on the x87 stack */ | |||||
| popl %ebx | |||||
| popl %esi | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,208 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER /* x87 kernel for two-component (complex) elements: adds up both components of M elements of X, stepping INCX elements each time, and leaves the total in st(0) */ | |||||
| #include "common.h" | |||||
| #define STACK 8 /* bytes taken by the two registers pushed right after PROLOGUE */ | |||||
| #define ARGS 0 | |||||
| #define STACK_M 4 + STACK + ARGS(%esp) /* stack slot of the element count */ | |||||
| #define STACK_X 8 + STACK + ARGS(%esp) /* stack slot of the vector pointer */ | |||||
| #define STACK_INCX 12 + STACK + ARGS(%esp) /* stack slot of the stride */ | |||||
| #define M %edx /* element count */ | |||||
| #define X %ecx /* current read pointer */ | |||||
| #define INCX %esi /* stride */ | |||||
| #define I %eax /* loop counter */ | |||||
| #include "l1param.h" | |||||
| PROLOGUE | |||||
| pushl %esi /* %esi is used as INCX below, so it is saved here */ | |||||
| pushl %ebx /* saved and restored; not referenced by name in the code visible here */ | |||||
| PROFCODE | |||||
| #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) | |||||
| EMMS /* NOTE(review): presumably clears MMX state so the x87 stack is usable for these Fortran interfaces - confirm against the macro definition */ | |||||
| #endif | |||||
| movl STACK_M, M | |||||
| movl STACK_X, X | |||||
| movl STACK_INCX, INCX | |||||
| #ifdef F_INTERFACE /* in this configuration M and INCX are passed as pointers and must be dereferenced */ | |||||
| movl (M), M | |||||
| movl (INCX), INCX | |||||
| #endif | |||||
| fldz /* first partial sum; also the 0.0 left in st(0) by the early exits below */ | |||||
| testl M, M | |||||
| jle .L999 /* M <= 0: leave with 0.0 */ | |||||
| testl INCX, INCX | |||||
| jle .L999 /* INCX <= 0: leave with 0.0 */ | |||||
| sall $ZBASE_SHIFT, INCX /* scale the stride; presumably elements -> bytes, since it is compared with 2 * SIZE below */ | |||||
| fldz /* three more partial sums: four accumulators now live on the x87 stack */ | |||||
| fldz | |||||
| fldz | |||||
| cmpl $SIZE * 2, INCX | |||||
| jne .L40 /* stride is not exactly one two-component element: use the strided loops */ | |||||
| movl M, I | |||||
| sarl $2, I /* I = M / 4 passes; each pass consumes 4 elements = 8 scalars */ | |||||
| jle .L20 /* fewer than 4 elements: only the remainder loop runs */ | |||||
| ALIGN_4 | |||||
| .L10: /* contiguous data, 8 scalars per pass */ | |||||
| #ifdef PREFETCH | |||||
| PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) | |||||
| #endif | |||||
| FLD 0 * SIZE(X) | |||||
| FLD 1 * SIZE(X) | |||||
| FLD 2 * SIZE(X) | |||||
| FLD 3 * SIZE(X) /* four values pushed: the accumulators now sit at st(4)..st(7) */ | |||||
| faddp %st, %st(7) /* each faddp adds the top value into a different accumulator and pops it */ | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| FLD 4 * SIZE(X) | |||||
| FLD 5 * SIZE(X) | |||||
| FLD 6 * SIZE(X) | |||||
| FLD 7 * SIZE(X) | |||||
| addl $8 * SIZE, X /* step past the 8 scalars just loaded */ | |||||
| faddp %st, %st(7) | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| decl I | |||||
| jg .L10 | |||||
| ALIGN_4 | |||||
| .L20: /* contiguous data, leftover elements */ | |||||
| movl M, I | |||||
| andl $3, I /* I = M mod 4 */ | |||||
| jle .L998 /* nothing left over */ | |||||
| ALIGN_4 | |||||
| .L21: /* one element (two scalars) per pass */ | |||||
| FLD 0 * SIZE(X) | |||||
| FLD 1 * SIZE(X) | |||||
| faddp %st,%st(3) /* second component goes into one accumulator ... */ | |||||
| faddp %st,%st(1) /* ... first component into another */ | |||||
| addl $2 * SIZE, X | |||||
| decl I | |||||
| jg .L21 | |||||
| jmp .L998 | |||||
| ALIGN_4 | |||||
| .L40: /* strided data */ | |||||
| movl M, I | |||||
| sarl $2, I /* I = M / 4 passes of the unrolled loop */ | |||||
| jle .L60 | |||||
| ALIGN_4 | |||||
| .L50: /* 4 elements per pass; X advances by the scaled stride after each pair of loads */ | |||||
| FLD 0 * SIZE(X) | |||||
| FLD 1 * SIZE(X) | |||||
| addl INCX, X | |||||
| FLD 0 * SIZE(X) | |||||
| FLD 1 * SIZE(X) | |||||
| addl INCX, X | |||||
| faddp %st, %st(7) /* same four-way accumulation as the contiguous loop */ | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| FLD 0 * SIZE(X) | |||||
| FLD 1 * SIZE(X) | |||||
| addl INCX, X | |||||
| FLD 0 * SIZE(X) | |||||
| FLD 1 * SIZE(X) | |||||
| addl INCX, X | |||||
| faddp %st, %st(7) | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| decl I | |||||
| jg .L50 | |||||
| ALIGN_4 | |||||
| .L60: /* strided data, leftover elements */ | |||||
| movl M, I | |||||
| andl $3, I /* I = M mod 4 */ | |||||
| jle .L998 | |||||
| ALIGN_4 | |||||
| .L61: | |||||
| FLD 0 * SIZE(X) | |||||
| FLD 1 * SIZE(X) | |||||
| addl INCX, X | |||||
| faddp %st,%st(3) | |||||
| faddp %st,%st(1) | |||||
| decl I | |||||
| jg .L61 | |||||
| ALIGN_4 | |||||
| .L998: /* fold the four partial sums into a single value in st(0) */ | |||||
| faddp %st,%st(2) | |||||
| faddp %st,%st(1) | |||||
| faddp %st,%st(1) | |||||
| ALIGN_4 | |||||
| .L999: /* common exit: st(0) holds the result */ | |||||
| popl %ebx | |||||
| popl %esi | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c | |||||
| CASUMKERNEL = ../arm/zasum.c | CASUMKERNEL = ../arm/zasum.c | ||||
| ZASUMKERNEL = ../arm/zasum.c | ZASUMKERNEL = ../arm/zasum.c | ||||
| # Source file selected for each SUM kernel variant (S, D, C and Z prefixes). | |||||
| SSUMKERNEL = ../arm/sum.c | |||||
| DSUMKERNEL = ../arm/sum.c | |||||
| CSUMKERNEL = ../arm/zsum.c | |||||
| ZSUMKERNEL = ../arm/zsum.c | |||||
| SAXPYKERNEL = ../arm/axpy.c | SAXPYKERNEL = ../arm/axpy.c | ||||
| DAXPYKERNEL = ../arm/axpy.c | DAXPYKERNEL = ../arm/axpy.c | ||||
| CAXPYKERNEL = ../arm/zaxpy.c | CAXPYKERNEL = ../arm/zaxpy.c | ||||
| @@ -0,0 +1,179 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER /* x87 kernel: adds up M elements of X, stepping INCX elements each time (no absolute value is taken), and leaves the total in st(0) */ | |||||
| #include "common.h" | |||||
| #define M ARG1 /* element count; ARG1..ARG3 are defined outside this file, presumably the first three argument registers */ | |||||
| #define X ARG2 /* current read pointer */ | |||||
| #define INCX ARG3 /* stride */ | |||||
| #define I %rax /* loop counter for the unrolled loops */ | |||||
| #include "l1param.h" | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
| fldz /* first partial sum; also the 0.0 left in st(0) by the early exits below */ | |||||
| testq M, M | |||||
| jle .L999 /* M <= 0: leave with 0.0 */ | |||||
| testq INCX, INCX | |||||
| jle .L999 /* INCX <= 0: leave with 0.0 */ | |||||
| salq $BASE_SHIFT, INCX /* scale the stride; presumably elements -> bytes, since it is compared with SIZE below */ | |||||
| fldz /* three more partial sums: four accumulators now live on the x87 stack */ | |||||
| fldz | |||||
| fldz | |||||
| cmpq $SIZE, INCX | |||||
| jne .L40 /* stride is not exactly one element: use the strided loops */ | |||||
| movq M, I | |||||
| sarq $3, I /* I = M / 8 passes of the unrolled loop */ | |||||
| jle .L20 /* fewer than 8 elements: only the remainder loop runs */ | |||||
| ALIGN_4 | |||||
| .L10: /* contiguous data, 8 elements per pass */ | |||||
| #ifdef PREFETCH | |||||
| PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) | |||||
| #endif | |||||
| FLD 0 * SIZE(X) | |||||
| FLD 1 * SIZE(X) | |||||
| FLD 2 * SIZE(X) | |||||
| FLD 3 * SIZE(X) /* four values pushed: the accumulators now sit at st(4)..st(7) */ | |||||
| faddp %st, %st(7) /* each faddp adds the top value into a different accumulator and pops it */ | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| FLD 4 * SIZE(X) | |||||
| FLD 5 * SIZE(X) | |||||
| FLD 6 * SIZE(X) | |||||
| FLD 7 * SIZE(X) | |||||
| addq $8 * SIZE, X /* step past the 8 elements just loaded */ | |||||
| faddp %st, %st(7) | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| decq I | |||||
| jg .L10 | |||||
| ALIGN_4 | |||||
| .L20: /* contiguous data, leftover elements */ | |||||
| andq $7, M /* M = M mod 8; M itself is the counter from here on */ | |||||
| jle .L998 /* nothing left over */ | |||||
| ALIGN_4 | |||||
| .L21: | |||||
| FLD (X) | |||||
| faddp %st,%st(1) /* add into the accumulator just below the loaded value, then pop */ | |||||
| addq $1 * SIZE, X | |||||
| decq M | |||||
| jg .L21 | |||||
| jmp .L998 | |||||
| ALIGN_4 | |||||
| .L40: /* strided data */ | |||||
| movq M, I | |||||
| sarq $3, I /* I = M / 8 passes of the unrolled loop */ | |||||
| jle .L60 | |||||
| ALIGN_4 | |||||
| .L50: /* 8 elements per pass, advancing X by the scaled stride after every load */ | |||||
| FLD (X) | |||||
| addq INCX, X | |||||
| FLD (X) | |||||
| addq INCX, X | |||||
| FLD (X) | |||||
| addq INCX, X | |||||
| FLD (X) | |||||
| addq INCX, X | |||||
| faddp %st, %st(7) /* same four-way accumulation as the contiguous loop */ | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| FLD (X) | |||||
| addq INCX, X | |||||
| FLD (X) | |||||
| addq INCX, X | |||||
| FLD (X) | |||||
| addq INCX, X | |||||
| FLD (X) | |||||
| addq INCX, X | |||||
| faddp %st, %st(7) | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| decq I | |||||
| jg .L50 | |||||
| ALIGN_4 | |||||
| .L60: /* strided data, leftover elements */ | |||||
| andq $7, M /* M = M mod 8; M itself is the counter from here on */ | |||||
| jle .L998 | |||||
| ALIGN_4 | |||||
| .L61: | |||||
| FLD (X) | |||||
| addq INCX, X | |||||
| faddp %st,%st(1) | |||||
| decq M | |||||
| jg .L61 | |||||
| ALIGN_4 | |||||
| .L998: /* fold the four partial sums into a single value in st(0) */ | |||||
| faddp %st,%st(2) | |||||
| faddp %st,%st(1) | |||||
| faddp %st,%st(1) | |||||
| ALIGN_4 | |||||
| .L999: /* common exit: st(0) holds the result */ | |||||
| ret /* NOTE(review): the result stays on the x87 stack and is never moved to %xmm0 - confirm this kernel is only built where the caller expects the value in st(0) (e.g. extended precision) */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,180 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER /* x87 kernel for two-component (complex) elements: adds up both components of M elements of X, stepping INCX elements each time, and leaves the total in st(0) */ | |||||
| #include "common.h" | |||||
| #define M ARG1 /* element count; ARG1..ARG3 are defined outside this file, presumably the first three argument registers */ | |||||
| #define X ARG2 /* current read pointer */ | |||||
| #define INCX ARG3 /* stride */ | |||||
| #define I %rax /* loop counter for the unrolled loops */ | |||||
| #include "l1param.h" | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
| fldz /* first partial sum; also the 0.0 left in st(0) by the early exits below */ | |||||
| testq M, M | |||||
| jle .L999 /* M <= 0: leave with 0.0 */ | |||||
| testq INCX, INCX | |||||
| jle .L999 /* INCX <= 0: leave with 0.0 */ | |||||
| salq $ZBASE_SHIFT, INCX /* scale the stride; presumably elements -> bytes, since it is compared with 2 * SIZE below */ | |||||
| fldz /* three more partial sums: four accumulators now live on the x87 stack */ | |||||
| fldz | |||||
| fldz | |||||
| cmpq $SIZE * 2, INCX | |||||
| jne .L40 /* stride is not exactly one two-component element: use the strided loops */ | |||||
| movq M, I | |||||
| sarq $2, I /* I = M / 4 passes; each pass consumes 4 elements = 8 scalars */ | |||||
| jle .L20 /* fewer than 4 elements: only the remainder loop runs */ | |||||
| ALIGN_4 | |||||
| .L10: /* contiguous data, 8 scalars per pass */ | |||||
| #ifdef PREFETCH | |||||
| PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) | |||||
| #endif | |||||
| FLD 0 * SIZE(X) | |||||
| FLD 1 * SIZE(X) | |||||
| FLD 2 * SIZE(X) | |||||
| FLD 3 * SIZE(X) /* four values pushed: the accumulators now sit at st(4)..st(7) */ | |||||
| faddp %st, %st(7) /* each faddp adds the top value into a different accumulator and pops it */ | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| FLD 4 * SIZE(X) | |||||
| FLD 5 * SIZE(X) | |||||
| FLD 6 * SIZE(X) | |||||
| FLD 7 * SIZE(X) | |||||
| addq $8 * SIZE, X /* step past the 8 scalars just loaded */ | |||||
| faddp %st, %st(7) | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| decq I | |||||
| jg .L10 | |||||
| ALIGN_4 | |||||
| .L20: /* contiguous data, leftover elements */ | |||||
| andq $3, M /* M = M mod 4; M itself is the counter from here on */ | |||||
| jle .L998 /* nothing left over */ | |||||
| ALIGN_4 | |||||
| .L21: /* one element (two scalars) per pass */ | |||||
| FLD 0 * SIZE(X) | |||||
| FLD 1 * SIZE(X) | |||||
| faddp %st,%st(3) /* second component goes into one accumulator ... */ | |||||
| faddp %st,%st(1) /* ... first component into another */ | |||||
| addq $2 * SIZE, X | |||||
| decq M | |||||
| jg .L21 | |||||
| jmp .L998 | |||||
| ALIGN_4 | |||||
| .L40: /* strided data */ | |||||
| movq M, I | |||||
| sarq $2, I /* I = M / 4 passes of the unrolled loop */ | |||||
| jle .L60 | |||||
| ALIGN_4 | |||||
| .L50: /* 4 elements per pass; X advances by the scaled stride after each pair of loads */ | |||||
| FLD 0 * SIZE(X) | |||||
| FLD 1 * SIZE(X) | |||||
| addq INCX, X | |||||
| FLD 0 * SIZE(X) | |||||
| FLD 1 * SIZE(X) | |||||
| addq INCX, X | |||||
| faddp %st, %st(7) /* same four-way accumulation as the contiguous loop */ | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| FLD 0 * SIZE(X) | |||||
| FLD 1 * SIZE(X) | |||||
| addq INCX, X | |||||
| FLD 0 * SIZE(X) | |||||
| FLD 1 * SIZE(X) | |||||
| addq INCX, X | |||||
| faddp %st, %st(7) | |||||
| faddp %st, %st(5) | |||||
| faddp %st, %st(3) | |||||
| faddp %st, %st(1) | |||||
| decq I | |||||
| jg .L50 | |||||
| ALIGN_4 | |||||
| .L60: /* strided data, leftover elements */ | |||||
| andq $3, M /* M = M mod 4; M itself is the counter from here on */ | |||||
| jle .L998 | |||||
| ALIGN_4 | |||||
| .L61: | |||||
| FLD 0 * SIZE(X) | |||||
| FLD 1 * SIZE(X) | |||||
| addq INCX, X | |||||
| faddp %st,%st(3) | |||||
| faddp %st,%st(1) | |||||
| decq M | |||||
| jg .L61 | |||||
| ALIGN_4 | |||||
| .L998: /* fold the four partial sums into a single value in st(0) */ | |||||
| faddp %st,%st(2) | |||||
| faddp %st,%st(1) | |||||
| faddp %st,%st(1) | |||||
| ALIGN_4 | |||||
| .L999: /* common exit: st(0) holds the result */ | |||||
| ret /* NOTE(review): the result stays on the x87 stack and is never moved to %xmm0 - confirm this kernel is only built where the caller expects the value in st(0) (e.g. extended precision) */ | |||||
| EPILOGUE | |||||
| @@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c | |||||
| CASUMKERNEL = ../arm/zasum.c | CASUMKERNEL = ../arm/zasum.c | ||||
| ZASUMKERNEL = zasum.c | ZASUMKERNEL = zasum.c | ||||
| # SUM kernels must use the plain-sum sources, not the *asum* (absolute-value) ones. | |||||
| SSUMKERNEL = ../arm/sum.c | |||||
| DSUMKERNEL = dsum.c | |||||
| CSUMKERNEL = ../arm/zsum.c | |||||
| ZSUMKERNEL = zsum.c | |||||
| SAXPYKERNEL = ../arm/axpy.c | SAXPYKERNEL = ../arm/axpy.c | ||||
| DAXPYKERNEL = daxpy.c | DAXPYKERNEL = daxpy.c | ||||
| CAXPYKERNEL = ../arm/zaxpy.c | CAXPYKERNEL = ../arm/zaxpy.c | ||||
| @@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c | |||||
| CASUMKERNEL = casum.c | CASUMKERNEL = casum.c | ||||
| ZASUMKERNEL = zasum.c | ZASUMKERNEL = zasum.c | ||||
| # Source file selected for each SUM kernel variant (S, D, C and Z prefixes). | |||||
| SSUMKERNEL = ssum.c | |||||
| DSUMKERNEL = dsum.c | |||||
| CSUMKERNEL = csum.c | |||||
| ZSUMKERNEL = zsum.c | |||||
| SAXPYKERNEL = saxpy.c | SAXPYKERNEL = saxpy.c | ||||
| DAXPYKERNEL = daxpy.c | DAXPYKERNEL = daxpy.c | ||||
| CAXPYKERNEL = caxpy.c | CAXPYKERNEL = caxpy.c | ||||
| @@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c | |||||
| CASUMKERNEL = ../arm/zasum.c | CASUMKERNEL = ../arm/zasum.c | ||||
| ZASUMKERNEL = ../arm/zasum.c | ZASUMKERNEL = ../arm/zasum.c | ||||
| # Source file selected for each SUM kernel variant (S, D, C and Z prefixes). | |||||
| SSUMKERNEL = ../arm/sum.c | |||||
| DSUMKERNEL = ../arm/sum.c | |||||
| CSUMKERNEL = ../arm/zsum.c | |||||
| ZSUMKERNEL = ../arm/zsum.c | |||||
| SAXPYKERNEL = ../arm/axpy.c | SAXPYKERNEL = ../arm/axpy.c | ||||
| DAXPYKERNEL = ../arm/axpy.c | DAXPYKERNEL = ../arm/axpy.c | ||||
| CAXPYKERNEL = ../arm/zaxpy.c | CAXPYKERNEL = ../arm/zaxpy.c | ||||
| @@ -0,0 +1,137 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| /* | |||||
|  * csum_kernel_32 - vector kernel that adds up every real and imaginary | |||||
|  * component of n contiguous complex elements starting at x. | |||||
|  * | |||||
|  * n is shifted right by 5 to obtain the loop count and the loop body always | |||||
|  * runs at least once, so the caller must pass a positive multiple of 32. | |||||
|  * Each pass loads 256 bytes (32 complex elements, assuming FLOAT is a 4-byte | |||||
|  * float here, as the single-precision vfasb/vstef instructions suggest). | |||||
|  * | |||||
|  * Returns the accumulated sum. | |||||
|  * | |||||
|  * Fix: the final store referenced an operand called "asum", but the output | |||||
|  * operand is declared as [sum]; the template now uses %[sum]. | |||||
|  */ | |||||
| static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| FLOAT sum; | |||||
| __asm__("vzero %%v24\n\t" /* v24..v31: eight independent vector accumulators */ | |||||
| "vzero %%v25\n\t" | |||||
| "vzero %%v26\n\t" | |||||
| "vzero %%v27\n\t" | |||||
| "vzero %%v28\n\t" | |||||
| "vzero %%v29\n\t" | |||||
| "vzero %%v30\n\t" | |||||
| "vzero %%v31\n\t" | |||||
| "srlg %[n],%[n],5\n\t" /* n = n / 32 = number of loop passes */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = byte offset into x, starting at 0 */ | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead of the current offset */ | |||||
| "vl %%v16, 0(%%r1,%[x])\n\t" /* first 128 bytes of this pass */ | |||||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 64(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 112(%%r1,%[x])\n\t" | |||||
| "vfasb %%v24,%%v24,%%v16\n\t" /* accumulate lane-wise, one accumulator per loaded register */ | |||||
| "vfasb %%v25,%%v25,%%v17\n\t" | |||||
| "vfasb %%v26,%%v26,%%v18\n\t" | |||||
| "vfasb %%v27,%%v27,%%v19\n\t" | |||||
| "vfasb %%v28,%%v28,%%v20\n\t" | |||||
| "vfasb %%v29,%%v29,%%v21\n\t" | |||||
| "vfasb %%v30,%%v30,%%v22\n\t" | |||||
| "vfasb %%v31,%%v31,%%v23\n\t" | |||||
| "vl %%v16, 128(%%r1,%[x])\n\t" /* second 128 bytes of this pass */ | |||||
| "vl %%v17, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 192(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 240(%%r1,%[x])\n\t" | |||||
| "vfasb %%v24,%%v24,%%v16\n\t" | |||||
| "vfasb %%v25,%%v25,%%v17\n\t" | |||||
| "vfasb %%v26,%%v26,%%v18\n\t" | |||||
| "vfasb %%v27,%%v27,%%v19\n\t" | |||||
| "vfasb %%v28,%%v28,%%v20\n\t" | |||||
| "vfasb %%v29,%%v29,%%v21\n\t" | |||||
| "vfasb %%v30,%%v30,%%v22\n\t" | |||||
| "vfasb %%v31,%%v31,%%v23\n\t" | |||||
| "agfi %%r1,256\n\t" /* advance the offset by the 256 bytes consumed */ | |||||
| "brctg %[n],0b\n\t" /* decrement the pass counter and loop while it is non-zero */ | |||||
| "vfasb %%v24,%%v24,%%v25\n\t" /* fold the eight accumulators into v24 */ | |||||
| "vfasb %%v24,%%v24,%%v26\n\t" | |||||
| "vfasb %%v24,%%v24,%%v27\n\t" | |||||
| "vfasb %%v24,%%v24,%%v28\n\t" | |||||
| "vfasb %%v24,%%v24,%%v29\n\t" | |||||
| "vfasb %%v24,%%v24,%%v30\n\t" | |||||
| "vfasb %%v24,%%v24,%%v31\n\t" | |||||
| "veslg %%v25,%%v24,32\n\t" /* horizontal reduction, step 1: add the neighbouring word of each doubleword */ | |||||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||||
| "vrepf %%v25,%%v24,2\n\t" /* step 2: add word 2 so that word 0 holds the full total */ | |||||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||||
| "vstef %%v24,%[sum],0" /* store word 0 into the C variable bound to [sum] */ | |||||
| : [sum] "=Q"(sum),[n] "+&r"(n) | |||||
| : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) /* declares that 2*n FLOATs at x are read */ | |||||
| : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||||
| "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return sum; | |||||
| } | |||||
| /* | |||||
|  * Adds up the real and imaginary components of n complex elements of x, | |||||
|  * stepping inc_x complex elements between reads. | |||||
|  * Returns 0.0 when n <= 0 or inc_x <= 0. | |||||
|  * With unit stride, the largest leading multiple of 32 elements is handed to | |||||
|  * csum_kernel_32 and only the tail is summed here. | |||||
|  */ | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| FLOAT total = 0.0; | |||||
| BLASLONG done = 0; /* complex elements already consumed */ | |||||
| BLASLONG pos = 0; /* index of the next real component inside x */ | |||||
| BLASLONG step = 2; /* scalars between consecutive elements */ | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return total; | |||||
| if (inc_x == 1) { | |||||
| BLASLONG blocked = n & -32; | |||||
| if (blocked > 0) { | |||||
| total = csum_kernel_32(blocked, x); | |||||
| done = blocked; | |||||
| pos = 2 * blocked; | |||||
| } | |||||
| } else { | |||||
| step = 2 * inc_x; | |||||
| } | |||||
| /* Scalar loop: the whole strided vector, or the tail after the kernel. */ | |||||
| for (; done < n; done++, pos += step) | |||||
| total += x[pos] + x[pos + 1]; | |||||
| return total; | |||||
| } | |||||
| @@ -0,0 +1,148 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| /* | |||||
|  * dsum_kernel_32 - vector kernel that adds up n contiguous elements of x. | |||||
|  * | |||||
|  * n is shifted right by 5 to obtain the loop count and the loop body always | |||||
|  * runs at least once, so the caller must pass a positive multiple of 32. | |||||
|  * Each pass loads 256 bytes (32 elements, assuming FLOAT is an 8-byte double | |||||
|  * here, as the double-precision vfadb/vsteg instructions suggest). | |||||
|  * | |||||
|  * Returns the accumulated sum. | |||||
|  * | |||||
|  * Fix: the final store referenced an operand called "asum", but the output | |||||
|  * operand is declared as [sum]; the template now uses %[sum]. | |||||
|  */ | |||||
| static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) { | |||||
| FLOAT sum; | |||||
| __asm__("vzero %%v24\n\t" /* v24..v31: eight independent vector accumulators */ | |||||
| "vzero %%v25\n\t" | |||||
| "vzero %%v26\n\t" | |||||
| "vzero %%v27\n\t" | |||||
| "vzero %%v28\n\t" | |||||
| "vzero %%v29\n\t" | |||||
| "vzero %%v30\n\t" | |||||
| "vzero %%v31\n\t" | |||||
| "srlg %[n],%[n],5\n\t" /* n = n / 32 = number of loop passes */ | |||||
| "xgr %%r1,%%r1\n\t" /* r1 = byte offset into x, starting at 0 */ | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead of the current offset */ | |||||
| "vl %%v16, 0(%%r1,%[x])\n\t" /* first 128 bytes of this pass */ | |||||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 64(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 112(%%r1,%[x])\n\t" | |||||
| "vfadb %%v24,%%v24,%%v16\n\t" /* accumulate lane-wise, one accumulator per loaded register */ | |||||
| "vfadb %%v25,%%v25,%%v17\n\t" | |||||
| "vfadb %%v26,%%v26,%%v18\n\t" | |||||
| "vfadb %%v27,%%v27,%%v19\n\t" | |||||
| "vfadb %%v28,%%v28,%%v20\n\t" | |||||
| "vfadb %%v29,%%v29,%%v21\n\t" | |||||
| "vfadb %%v30,%%v30,%%v22\n\t" | |||||
| "vfadb %%v31,%%v31,%%v23\n\t" | |||||
| "vl %%v16, 128(%%r1,%[x])\n\t" /* second 128 bytes of this pass */ | |||||
| "vl %%v17, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 192(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 240(%%r1,%[x])\n\t" | |||||
| "vfadb %%v24,%%v24,%%v16\n\t" | |||||
| "vfadb %%v25,%%v25,%%v17\n\t" | |||||
| "vfadb %%v26,%%v26,%%v18\n\t" | |||||
| "vfadb %%v27,%%v27,%%v19\n\t" | |||||
| "vfadb %%v28,%%v28,%%v20\n\t" | |||||
| "vfadb %%v29,%%v29,%%v21\n\t" | |||||
| "vfadb %%v30,%%v30,%%v22\n\t" | |||||
| "vfadb %%v31,%%v31,%%v23\n\t" | |||||
| "agfi %%r1,256\n\t" /* advance the offset by the 256 bytes consumed */ | |||||
| "brctg %[n],0b\n\t" /* decrement the pass counter and loop while it is non-zero */ | |||||
| "vfadb %%v24,%%v24,%%v25\n\t" /* fold the eight accumulators into v24 */ | |||||
| "vfadb %%v24,%%v24,%%v26\n\t" | |||||
| "vfadb %%v24,%%v24,%%v27\n\t" | |||||
| "vfadb %%v24,%%v24,%%v28\n\t" | |||||
| "vfadb %%v24,%%v24,%%v29\n\t" | |||||
| "vfadb %%v24,%%v24,%%v30\n\t" | |||||
| "vfadb %%v24,%%v24,%%v31\n\t" | |||||
| "vrepg %%v25,%%v24,1\n\t" /* horizontal reduction: add doubleword 1 into doubleword 0 */ | |||||
| "vfadb %%v24,%%v24,%%v25\n\t" | |||||
| "vsteg %%v24,%[sum],0" /* store doubleword 0 into the C variable bound to [sum] */ | |||||
| : [sum] "=Q"(sum),[n] "+&r"(n) | |||||
| : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) /* declares that n FLOATs at x are read */ | |||||
| : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||||
| "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return sum; | |||||
| } | |||||
| /* | |||||
|  * Adds up n elements of x, stepping inc_x elements between reads. | |||||
|  * Returns 0.0 when n <= 0 or inc_x <= 0. | |||||
|  * With unit stride, the largest leading multiple of 32 elements is handed to | |||||
|  * dsum_kernel_32 and the tail is summed here; otherwise the strided data is | |||||
|  * summed four elements at a time into two alternating partial sums. | |||||
|  */ | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| FLOAT total = 0.0; | |||||
| BLASLONG pos = 0; /* index into x */ | |||||
| BLASLONG count = 0; /* elements consumed on the strided path */ | |||||
| if (!(n > 0 && inc_x > 0)) | |||||
| return total; | |||||
| if (inc_x == 1) { | |||||
| BLASLONG blocked = n & -32; | |||||
| if (blocked > 0) { | |||||
| total = dsum_kernel_32(blocked, x); | |||||
| pos = blocked; | |||||
| } | |||||
| /* Scalar tail after the vector kernel. */ | |||||
| for (; pos < n; pos++) | |||||
| total += x[pos]; | |||||
| return total; | |||||
| } | |||||
| { | |||||
| BLASLONG quads = n & -4; /* element count rounded down to a multiple of 4 */ | |||||
| FLOAT even = 0.0; | |||||
| FLOAT odd = 0.0; | |||||
| for (; count < quads; count += 4) { | |||||
| even += x[pos]; | |||||
| odd += x[pos + inc_x]; | |||||
| even += x[pos + 2 * inc_x]; | |||||
| odd += x[pos + 3 * inc_x]; | |||||
| pos += inc_x * 4; | |||||
| } | |||||
| total = even + odd; | |||||
| /* Up to three leftover elements. */ | |||||
| for (; count < n; count++) { | |||||
| total += x[pos]; | |||||
| pos += inc_x; | |||||
| } | |||||
| } | |||||
| return total; | |||||
| } | |||||
| @@ -0,0 +1,151 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) { | |||||
| FLOAT sum; | |||||
| __asm__("vzero %%v24\n\t" | |||||
| "vzero %%v25\n\t" | |||||
| "vzero %%v26\n\t" | |||||
| "vzero %%v27\n\t" | |||||
| "vzero %%v28\n\t" | |||||
| "vzero %%v29\n\t" | |||||
| "vzero %%v30\n\t" | |||||
| "vzero %%v31\n\t" | |||||
| "srlg %[n],%[n],6\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 0(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 64(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 112(%%r1,%[x])\n\t" | |||||
| "vfasb %%v24,%%v24,%%v16\n\t" | |||||
| "vfasb %%v25,%%v25,%%v17\n\t" | |||||
| "vfasb %%v26,%%v26,%%v18\n\t" | |||||
| "vfasb %%v27,%%v27,%%v19\n\t" | |||||
| "vfasb %%v28,%%v28,%%v20\n\t" | |||||
| "vfasb %%v29,%%v29,%%v21\n\t" | |||||
| "vfasb %%v30,%%v30,%%v22\n\t" | |||||
| "vfasb %%v31,%%v31,%%v23\n\t" | |||||
| "vl %%v16, 128(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 192(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 240(%%r1,%[x])\n\t" | |||||
| "vfasb %%v24,%%v24,%%v16\n\t" | |||||
| "vfasb %%v25,%%v25,%%v17\n\t" | |||||
| "vfasb %%v26,%%v26,%%v18\n\t" | |||||
| "vfasb %%v27,%%v27,%%v19\n\t" | |||||
| "vfasb %%v28,%%v28,%%v20\n\t" | |||||
| "vfasb %%v29,%%v29,%%v21\n\t" | |||||
| "vfasb %%v30,%%v30,%%v22\n\t" | |||||
| "vfasb %%v31,%%v31,%%v23\n\t" | |||||
| "agfi %%r1,256\n\t" | |||||
| "brctg %[n],0b\n\t" | |||||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||||
| "vfasb %%v24,%%v24,%%v26\n\t" | |||||
| "vfasb %%v24,%%v24,%%v27\n\t" | |||||
| "vfasb %%v24,%%v24,%%v28\n\t" | |||||
| "vfasb %%v24,%%v24,%%v29\n\t" | |||||
| "vfasb %%v24,%%v24,%%v30\n\t" | |||||
| "vfasb %%v24,%%v24,%%v31\n\t" | |||||
| "veslg %%v25,%%v24,32\n\t" | |||||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||||
| "vrepf %%v25,%%v24,2\n\t" | |||||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||||
| "vstef %%v24,%[asum],0" | |||||
| : [sum] "=Q"(sum),[n] "+&r"(n) | |||||
| : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) | |||||
| : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||||
| "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return sum; | |||||
| } | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG j = 0; | |||||
| FLOAT sumf = 0.0; | |||||
| BLASLONG n1; | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return sumf; | |||||
| if (inc_x == 1) { | |||||
| n1 = n & -64; | |||||
| if (n1 > 0) { | |||||
| sumf = ssum_kernel_64(n1, x); | |||||
| i = n1; | |||||
| } | |||||
| while (i < n) { | |||||
| sumf += x[i]; | |||||
| i++; | |||||
| } | |||||
| } else { | |||||
| BLASLONG n1 = n & -4; | |||||
| register FLOAT sum1, sum2; | |||||
| sum1 = 0.0; | |||||
| sum2 = 0.0; | |||||
| while (j < n1) { | |||||
| sum1 += x[i]; | |||||
| sum2 += x[i + inc_x]; | |||||
| sum1 += x[i + 2 * inc_x]; | |||||
| sum2 += x[i + 3 * inc_x]; | |||||
| i += inc_x * 4; | |||||
| j += 4; | |||||
| } | |||||
| sumf = sum1 + sum2; | |||||
| while (j < n) { | |||||
| sumf += x[i]; | |||||
| i += inc_x; | |||||
| j++; | |||||
| } | |||||
| } | |||||
| return sumf; | |||||
| } | |||||
| @@ -0,0 +1,136 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <math.h> | |||||
| static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) { | |||||
| FLOAT sum; | |||||
| __asm__("vzero %%v24\n\t" | |||||
| "vzero %%v25\n\t" | |||||
| "vzero %%v26\n\t" | |||||
| "vzero %%v27\n\t" | |||||
| "vzero %%v28\n\t" | |||||
| "vzero %%v29\n\t" | |||||
| "vzero %%v30\n\t" | |||||
| "vzero %%v31\n\t" | |||||
| "srlg %[n],%[n],4\n\t" | |||||
| "xgr %%r1,%%r1\n\t" | |||||
| "0:\n\t" | |||||
| "pfd 1, 1024(%%r1,%[x])\n\t" | |||||
| "vl %%v16, 0(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 16(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 32(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 48(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 64(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 80(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 96(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 112(%%r1,%[x])\n\t" | |||||
| "vfadb %%v24,%%v24,%%v16\n\t" | |||||
| "vfadb %%v25,%%v25,%%v17\n\t" | |||||
| "vfadb %%v26,%%v26,%%v18\n\t" | |||||
| "vfadb %%v27,%%v27,%%v19\n\t" | |||||
| "vfadb %%v28,%%v28,%%v20\n\t" | |||||
| "vfadb %%v29,%%v29,%%v21\n\t" | |||||
| "vfadb %%v30,%%v30,%%v22\n\t" | |||||
| "vfadb %%v31,%%v31,%%v23\n\t" | |||||
| "vl %%v16, 128(%%r1,%[x])\n\t" | |||||
| "vl %%v17, 144(%%r1,%[x])\n\t" | |||||
| "vl %%v18, 160(%%r1,%[x])\n\t" | |||||
| "vl %%v19, 176(%%r1,%[x])\n\t" | |||||
| "vl %%v20, 192(%%r1,%[x])\n\t" | |||||
| "vl %%v21, 208(%%r1,%[x])\n\t" | |||||
| "vl %%v22, 224(%%r1,%[x])\n\t" | |||||
| "vl %%v23, 240(%%r1,%[x])\n\t" | |||||
| "vfadb %%v24,%%v24,%%v16\n\t" | |||||
| "vfadb %%v25,%%v25,%%v17\n\t" | |||||
| "vfadb %%v26,%%v26,%%v18\n\t" | |||||
| "vfadb %%v27,%%v27,%%v19\n\t" | |||||
| "vfadb %%v28,%%v28,%%v20\n\t" | |||||
| "vfadb %%v29,%%v29,%%v21\n\t" | |||||
| "vfadb %%v30,%%v30,%%v22\n\t" | |||||
| "vfadb %%v31,%%v31,%%v23\n\t" | |||||
| "agfi %%r1,256\n\t" | |||||
| "brctg %[n],0b\n\t" | |||||
| "vfadb %%v24,%%v24,%%v25\n\t" | |||||
| "vfadb %%v24,%%v24,%%v26\n\t" | |||||
| "vfadb %%v24,%%v24,%%v27\n\t" | |||||
| "vfadb %%v24,%%v24,%%v28\n\t" | |||||
| "vfadb %%v24,%%v24,%%v29\n\t" | |||||
| "vfadb %%v24,%%v24,%%v30\n\t" | |||||
| "vfadb %%v24,%%v24,%%v31\n\t" | |||||
| "vrepg %%v25,%%v24,1\n\t" | |||||
| "vfadb %%v24,%%v24,%%v25\n\t" | |||||
| "vsteg %%v24,%[asum],0" | |||||
| : [sum] "=Q"(sum),[n] "+&r"(n) | |||||
| : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) | |||||
| : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||||
| "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); | |||||
| return sum; | |||||
| } | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||||
| BLASLONG i = 0; | |||||
| BLASLONG ip = 0; | |||||
| FLOAT sumf = 0.0; | |||||
| BLASLONG n1; | |||||
| BLASLONG inc_x2; | |||||
| if (n <= 0 || inc_x <= 0) | |||||
| return (sumf); | |||||
| if (inc_x == 1) { | |||||
| n1 = n & -16; | |||||
| if (n1 > 0) { | |||||
| sumf = zsum_kernel_16(n1, x); | |||||
| i = n1; | |||||
| ip = 2 * n1; | |||||
| } | |||||
| while (i < n) { | |||||
| sumf += x[ip] + x[ip + 1]; | |||||
| i++; | |||||
| ip += 2; | |||||
| } | |||||
| } else { | |||||
| inc_x2 = 2 * inc_x; | |||||
| while (i < n) { | |||||
| sumf += x[ip] + x[ip + 1]; | |||||
| ip += inc_x2; | |||||
| i++; | |||||
| } | |||||
| } | |||||
| return (sumf); | |||||
| } | |||||