Merge pull request #2072 from martin-frbg/sum

Add (C)BLAS extension ?sum
6 years ago · ccfb7ead15
--- a/cblas.h
+++ b/cblas.h
@@ -73,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS
 float  cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

 /* OpenBLAS extension: the ?sum family. Each routine takes the same
    (n, x, incx) argument list as the corresponding ?asum routine above.
    NOTE(review): the complex variants (scsum/dzsum) accept the vector as
    void* and return a single real scalar; how the real and imaginary parts
    are combined is decided by the zsum kernel - confirm there. */
 float  cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 float  cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

 float  cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float  *X, OPENBLAS_CONST blasint incX);
 double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
 float  cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void  *X, OPENBLAS_CONST blasint incX);
--- a/cmake/kernel.cmake
+++ b/cmake/kernel.cmake
@@ -107,6 +107,12 @@ macro(SetDefaultL1)
  set(DAXPBYKERNEL ../arm/axpby.c)
  set(CAXPBYKERNEL ../arm/zaxpby.c)
  set(ZAXPBYKERNEL ../arm/zaxpby.c)
  set(SSUMKERNEL sum.S)
  set(DSUMKERNEL sum.S)
  set(CSUMKERNEL zsum.S)
  set(ZSUMKERNEL zsum.S)
  set(QSUMKERNEL sum.S)
  set(XSUMKERNEL zsum.S)
 endmacro ()

 macro(SetDefaultL2)
@@ -162,4 +168,4 @@ macro(SetDefaultL3)
  set(DGEADD_KERNEL ../generic/geadd.c)
  set(CGEADD_KERNEL ../generic/zgeadd.c)
  set(ZGEADD_KERNEL ../generic/zgeadd.c)
 endmacro ()
 endmacro ()
--- a/common_c.h
+++ b/common_c.h
@@ -19,6 +19,7 @@
 #define	CDOTC_K			cdotc_k
 #define	CNRM2_K			cnrm2_k
 #define	CSCAL_K			cscal_k
 #define	CSUM_K			csum_k
 #define	CSWAP_K			cswap_k
 #define	CROT_K			csrot_k

@@ -249,6 +250,7 @@
 #define	CDOTC_K			gotoblas -> cdotc_k
 #define	CNRM2_K			gotoblas -> cnrm2_k
 #define	CSCAL_K			gotoblas -> cscal_k
 #define	CSUM_K			gotoblas -> csum_k
 #define	CSWAP_K			gotoblas -> cswap_k
 #define	CROT_K			gotoblas -> csrot_k

--- a/common_d.h
+++ b/common_d.h
@@ -19,6 +19,7 @@
 #define	DDOTC_K			ddot_k
 #define	DNRM2_K			dnrm2_k
 #define	DSCAL_K			dscal_k
 #define	DSUM_K			dsum_k
 #define	DSWAP_K			dswap_k
 #define	DROT_K			drot_k

@@ -174,6 +175,7 @@
 #define	DDOTC_K			gotoblas -> ddot_k
 #define	DNRM2_K			gotoblas -> dnrm2_k
 #define	DSCAL_K			gotoblas -> dscal_k
 #define	DSUM_K			gotoblas -> dsum_k
 #define	DSWAP_K			gotoblas -> dswap_k
 #define	DROT_K			gotoblas -> drot_k

--- a/common_interface.h
+++ b/common_interface.h
@@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *);
 double BLASFUNC(dzasum)(blasint *, double *, blasint *);
 xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);

 /* Fortran-callable ?sum entry points. As with the ?asum declarations
    above, every argument (count, vector, stride) is passed by pointer.
    The single-precision variants return FLOATRET; the others return the
    real type of their precision. */
 FLOATRET  BLASFUNC(ssum) (blasint *, float  *, blasint *);
 FLOATRET  BLASFUNC(scsum)(blasint *, float  *, blasint *);
 double BLASFUNC(dsum) (blasint *, double *, blasint *);
 xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *);
 double BLASFUNC(dzsum)(blasint *, double *, blasint *);
 xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *);

 blasint    BLASFUNC(isamax)(blasint *, float  *, blasint *);
 blasint    BLASFUNC(idamax)(blasint *, double *, blasint *);
 blasint    BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);
--- a/common_level1.h
+++ b/common_level1.h
@@ -100,6 +100,13 @@ float   casum_k (BLASLONG, float  *, BLASLONG);
 double  zasum_k (BLASLONG, double *, BLASLONG);
 xdouble xasum_k (BLASLONG, xdouble *, BLASLONG);

 /* Level-1 ?sum kernels: (element count, vector, stride) -> scalar.
    The c/z/x variants take a pointer to the base real type and also return
    a real scalar, matching the ?asum kernel prototypes above. */
 float   ssum_k (BLASLONG, float  *, BLASLONG);
 double  dsum_k (BLASLONG, double *, BLASLONG);
 xdouble qsum_k (BLASLONG, xdouble *, BLASLONG);
 float   csum_k (BLASLONG, float  *, BLASLONG);
 double  zsum_k (BLASLONG, double *, BLASLONG);
 xdouble xsum_k (BLASLONG, xdouble *, BLASLONG);

 float   samax_k (BLASLONG, float  *, BLASLONG);
 double  damax_k (BLASLONG, double *, BLASLONG);
 xdouble qamax_k (BLASLONG, xdouble *, BLASLONG);
--- a/common_macro.h
+++ b/common_macro.h
@@ -66,6 +66,7 @@
 #define	DOTC_K			QDOTC_K
 #define	NRM2_K			QNRM2_K
 #define	SCAL_K			QSCAL_K
 #define	SUM_K			QSUM_K
 #define	SWAP_K			QSWAP_K
 #define	ROT_K			QROT_K

@@ -356,6 +357,7 @@
 #define	DOTC_K			DDOTC_K
 #define	NRM2_K			DNRM2_K
 #define	SCAL_K			DSCAL_K
 #define	SUM_K			DSUM_K
 #define	SWAP_K			DSWAP_K
 #define	ROT_K			DROT_K

@@ -658,6 +660,7 @@
 #define	DOTC_K			SDOTC_K
 #define	NRM2_K			SNRM2_K
 #define	SCAL_K			SSCAL_K
 #define	SUM_K			SSUM_K
 #define	SWAP_K			SSWAP_K
 #define	ROT_K			SROT_K

@@ -962,6 +965,7 @@
 #define	DOTC_K			XDOTC_K
 #define	NRM2_K			XNRM2_K
 #define	SCAL_K			XSCAL_K
 #define	SUM_K			XSUM_K
 #define	SWAP_K			XSWAP_K
 #define	ROT_K			XROT_K

@@ -1363,6 +1367,7 @@
 #define	DOTC_K			ZDOTC_K
 #define	NRM2_K			ZNRM2_K
 #define	SCAL_K			ZSCAL_K
 #define	SUM_K			ZSUM_K
 #define	SWAP_K			ZSWAP_K
 #define	ROT_K			ZROT_K

@@ -1785,6 +1790,7 @@
 #define	DOTC_K			CDOTC_K
 #define	NRM2_K			CNRM2_K
 #define	SCAL_K			CSCAL_K
 #define	SUM_K			CSUM_K
 #define	SWAP_K			CSWAP_K
 #define	ROT_K			CROT_K

--- a/common_param.h
+++ b/common_param.h
@@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);

  float  (*snrm2_k) (BLASLONG, float *, BLASLONG);
  float  (*sasum_k) (BLASLONG, float *, BLASLONG);
  float  (*ssum_k)  (BLASLONG, float *, BLASLONG);
  int    (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
  float  (*sdot_k)  (BLASLONG, float *, BLASLONG, float *, BLASLONG);
  double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
@@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);

  double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
  double (*dasum_k) (BLASLONG, double *, BLASLONG);
  double (*dsum_k)  (BLASLONG, double *, BLASLONG);
  int    (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
  double (*ddot_k)  (BLASLONG, double *, BLASLONG, double *, BLASLONG);
  int    (*drot_k)  (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
@@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);

 xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG);
 xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG);
 xdouble (*qsum_k)  (BLASLONG, xdouble *, BLASLONG);
  int    (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
 xdouble (*qdot_k)  (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
  int    (*qrot_k)  (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
@@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);

  float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
  float (*casum_k) (BLASLONG, float *, BLASLONG);
  float (*csum_k)  (BLASLONG, float *, BLASLONG);
  int    (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
  openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
  openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
@@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);

  double (*znrm2_k) (BLASLONG, double *, BLASLONG);
  double (*zasum_k) (BLASLONG, double *, BLASLONG);
  double (*zsum_k)  (BLASLONG, double *, BLASLONG);
  int    (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
  openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
  openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
@@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);

  xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG);
  xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG);
  xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG);
  int    (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
  openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
  openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
--- a/common_q.h
+++ b/common_q.h
@@ -19,6 +19,7 @@
 #define	QDOTC_K			qdot_k
 #define	QNRM2_K			qnrm2_k
 #define	QSCAL_K			qscal_k
 #define	QSUM_K			qsum_k
 #define	QSWAP_K			qswap_k
 #define	QROT_K			qrot_k

@@ -161,6 +162,7 @@
 #define	QDOTC_K			gotoblas -> qdot_k
 #define	QNRM2_K			gotoblas -> qnrm2_k
 #define	QSCAL_K			gotoblas -> qscal_k
 #define	QSUM_K			gotoblas -> qsum_k
 #define	QSWAP_K			gotoblas -> qswap_k
 #define	QROT_K			gotoblas -> qrot_k

--- a/common_s.h
+++ b/common_s.h
@@ -12,6 +12,7 @@
 #define	ISMAX_K			ismax_k
 #define	ISMIN_K			ismin_k
 #define	SASUM_K			sasum_k
 #define	SSUM_K			ssum_k
 #define	SAXPYU_K		saxpy_k
 #define	SAXPYC_K		saxpy_k
 #define	SCOPY_K			scopy_k
@@ -170,6 +171,7 @@
 #define	ISMAX_K			gotoblas -> ismax_k
 #define	ISMIN_K			gotoblas -> ismin_k
 #define	SASUM_K			gotoblas -> sasum_k
 #define	SSUM_K			gotoblas -> ssum_k
 #define	SAXPYU_K		gotoblas -> saxpy_k
 #define	SAXPYC_K		gotoblas -> saxpy_k
 #define	SCOPY_K			gotoblas -> scopy_k
--- a/common_x.h
+++ b/common_x.h
@@ -19,6 +19,7 @@
 #define	XDOTC_K			xdotc_k
 #define	XNRM2_K			xnrm2_k
 #define	XSCAL_K			xscal_k
 #define	XSUM_K			xsum_k
 #define	XSWAP_K			xswap_k
 #define	XROT_K			xqrot_k

@@ -227,6 +228,7 @@
 #define	XDOTC_K			gotoblas -> xdotc_k
 #define	XNRM2_K			gotoblas -> xnrm2_k
 #define	XSCAL_K			gotoblas -> xscal_k
 #define	XSUM_K			gotoblas -> xsum_k
 #define	XSWAP_K			gotoblas -> xswap_k
 #define	XROT_K			gotoblas -> xqrot_k

--- a/common_z.h
+++ b/common_z.h
@@ -19,6 +19,7 @@
 #define	ZDOTC_K			zdotc_k
 #define	ZNRM2_K			znrm2_k
 #define	ZSCAL_K			zscal_k
 #define	ZSUM_K			zsum_k
 #define	ZSWAP_K			zswap_k
 #define	ZROT_K			zdrot_k

@@ -249,6 +250,7 @@
 #define	ZDOTC_K			gotoblas -> zdotc_k
 #define	ZNRM2_K			gotoblas -> znrm2_k
 #define	ZSCAL_K			gotoblas -> zscal_k
 #define	ZSUM_K			gotoblas -> zsum_k
 #define	ZSWAP_K			gotoblas -> zswap_k
 #define	ZROT_K			gotoblas -> zdrot_k

--- a/interface/CMakeLists.txt
+++ b/interface/CMakeLists.txt
@@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES
  rotm.c rotmg.c # N.B. these do not have complex counterparts
  rot.c
  asum.c
  sum.c
 )

 # these will have 'z' prepended for the complex version
@@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES})
    GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX")
    GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX")
    GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX")
    GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX")
  endif ()
  if (${float_type} STREQUAL "ZCOMPLEX")
    GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX")
@@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES})
    GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
    GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
    GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
    GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
  endif ()
 endforeach ()

--- a/interface/Makefile
+++ b/interface/Makefile
@@ -25,7 +25,7 @@ SBLAS1OBJS    = \
 		saxpy.$(SUFFIX) sswap.$(SUFFIX) \
 		scopy.$(SUFFIX) sscal.$(SUFFIX) \
 		sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \
 		sasum.$(SUFFIX) snrm2.$(SUFFIX) \
 		sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \
 		smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \
 		smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \
 		srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \
@@ -51,7 +51,7 @@ DBLAS1OBJS    = \
 		daxpy.$(SUFFIX) dswap.$(SUFFIX) \
 		dcopy.$(SUFFIX) dscal.$(SUFFIX) \
 		ddot.$(SUFFIX) \
 		dasum.$(SUFFIX) dnrm2.$(SUFFIX) \
 		dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \
 		dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \
 		dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \
 		drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \
@@ -76,7 +76,7 @@ CBLAS1OBJS    = \
 		caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
 		ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \
 		cdotc.$(SUFFIX)  cdotu.$(SUFFIX) \
 		scasum.$(SUFFIX) scnrm2.$(SUFFIX) \
 		scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \
 		scamax.$(SUFFIX) icamax.$(SUFFIX) \
 		scamin.$(SUFFIX) icamin.$(SUFFIX) \
 		csrot.$(SUFFIX) crotg.$(SUFFIX) \
@@ -105,7 +105,7 @@ ZBLAS1OBJS    = \
 		zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
 		zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \
 		zdotc.$(SUFFIX)  zdotu.$(SUFFIX) \
 		dzasum.$(SUFFIX)  dznrm2.$(SUFFIX) \
 		dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \
 		dzamax.$(SUFFIX) izamax.$(SUFFIX) \
 		dzamin.$(SUFFIX) izamin.$(SUFFIX) \
 		zdrot.$(SUFFIX) zrotg.$(SUFFIX) \
@@ -146,7 +146,7 @@ QBLAS1OBJS    = \
 		qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
 		qcopy.$(SUFFIX) qscal.$(SUFFIX) \
 		qdot.$(SUFFIX) \
 		qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
 		qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
 		qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
 		qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
 		qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
@@ -168,7 +168,7 @@ XBLAS1OBJS    = \
 		xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
 		xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
 		xdotc.$(SUFFIX)  xdotu.$(SUFFIX) \
 		qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
 		qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
 		qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
 		qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
 		xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
@@ -203,7 +203,7 @@ ifdef QUAD_PRECISION
 QBLAS1OBJS    = \
 		qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
 		qcopy.$(SUFFIX) qscal.$(SUFFIX) \
 		qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
 		qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
 		qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
 		qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
 		qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
@@ -224,7 +224,7 @@ QBLAS3OBJS    = \
 XBLAS1OBJS    = \
 		xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
 		xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
 		qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
 		qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
 		qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
 		qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
 		xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
@@ -264,7 +264,7 @@ CSBLAS1OBJS   = \
 	cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
 	cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
 	cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
 	cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX)
 	cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)

 CSBLAS2OBJS   = \
 	cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
@@ -282,7 +282,7 @@ CDBLAS1OBJS   = \
 	cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
 	cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
 	cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
 	cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX)
 	cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)

 CDBLAS2OBJS   = \
 	cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
@@ -303,7 +303,7 @@ CCBLAS1OBJS   = \
 	cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
 	cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
 	cblas_caxpby.$(SUFFIX) \
 	cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX)
 	cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX)

 CCBLAS2OBJS   = \
 	cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
@@ -330,7 +330,7 @@ CZBLAS1OBJS   = \
 	cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
 	cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
 	cblas_zaxpby.$(SUFFIX) \
 	cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX)
 	cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX)


 CZBLAS2OBJS   = \
@@ -565,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c
 qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c
 	$(CC) $(CFLAGS) -c $< -o $(@F)

 # Fortran-interface ?sum objects. Every precision variant (s, d, q, sc, dz,
 # qx) is compiled from the same sum.c, in both the regular ($(SUFFIX)) and
 # profiled ($(PSUFFIX)) flavours. $(@F) is the file part of the target name.
 # NOTE(review): no precision-selecting -D flags appear in these recipes;
 # presumably they are injected through CFLAGS per target - confirm.
 ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c
 	$(CC) $(CFLAGS) -c $< -o $(@F)

 dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c
 	$(CC) $(CFLAGS) -c $< -o $(@F)

 qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c
 	$(CC) $(CFLAGS) -c $< -o $(@F)

 scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c
 	$(CC) $(CFLAGS) -c $< -o $(@F)

 dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c
 	$(CC) $(CFLAGS) -c $< -o $(@F)

 qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c
 	$(CC) $(CFLAGS) -c $< -o $(@F)

 snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c
 	$(CC) $(CFLAGS) -c $< -o $(@F)

@@ -1412,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c
 cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

 # CBLAS ?sum objects: the same sum.c built with -DCBLAS, which selects the
 # CNAME (by-value arguments) entry point instead of the Fortran-style one.
 # Only the s, d, sc and dz variants are built here.
 cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

 cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

 cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

 cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

 cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

@@ -1419,7 +1449,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

 cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
 		$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

 cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
--- a/interface/sum.c
+++ b/interface/sum.c
@@ -0,0 +1,97 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

 #include <stdio.h>
 #include "common.h"
 #ifdef FUNCTION_PROFILE
 #include "functable.h"
 #endif

 #ifndef CBLAS

 /*
  * Fortran-style ?sum entry point: the element count and the stride arrive
  * by reference. A non-positive count yields 0 without touching the kernel;
  * otherwise the result of SUM_K is returned, converted to FLOATRET.
  */
 FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){

  const BLASLONG count  = *N;
  const BLASLONG stride = *INCX;
  FLOATRET total;

  PRINT_DEBUG_NAME;

  /* Nothing to add up. */
  if (count < 1) return 0;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  total = (FLOATRET)SUM_K(count, x, stride);

  /* Both the area and the operation count are reported as the element count. */
  FUNCTION_PROFILE_END(COMPSIZE, count, count);

  IDEBUG_END;

  return total;
 }

 #else
 /*
  * CBLAS ?sum entry point: count and stride are passed by value. Complex
  * builds receive the vector as void* and reinterpret it as FLOAT*; real
  * builds take FLOAT* directly. A non-positive count yields 0 without
  * calling the kernel; otherwise the SUM_K result is returned as FLOAT.
  */
 #ifndef COMPLEX
 FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
 #else
 FLOAT CNAME(blasint n, void *vx, blasint incx){
  FLOAT *x = (FLOAT *)vx;
 #endif

  FLOAT total;

  PRINT_DEBUG_CNAME;

  /* Nothing to add up. */
  if (n < 1) return 0;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  total = SUM_K(n, x, incx);

  /* Both the area and the operation count are reported as the element count. */
  FUNCTION_PROFILE_END(COMPSIZE, n, n);

  IDEBUG_END;

  return total;
 }

 #endif
--- a/kernel/CMakeLists.txt
+++ b/kernel/CMakeLists.txt
@@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
      GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type})
      GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type})
      GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type})
      GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type})

      if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
        GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type})
--- a/kernel/Makefile.L1
+++ b/kernel/Makefile.L1
@@ -340,6 +340,32 @@ ifndef XSCALKERNEL
 XSCALKERNEL = zscal.S
 endif

 ### SUM ###
 # Default ?sum kernel sources. Each variable is assigned only when it has
 # not already been defined, so an earlier definition takes precedence.
 # The S/D/Q variables default to sum.S; the C/Z/X variables default to zsum.S.

 ifndef SSUMKERNEL
 SSUMKERNEL =  sum.S
 endif

 ifndef DSUMKERNEL
 DSUMKERNEL =  sum.S
 endif

 ifndef CSUMKERNEL
 CSUMKERNEL = zsum.S
 endif

 ifndef ZSUMKERNEL
 ZSUMKERNEL = zsum.S
 endif

 ifndef QSUMKERNEL
 QSUMKERNEL =  sum.S
 endif

 ifndef XSUMKERNEL
 XSUMKERNEL = zsum.S
 endif

 ### SWAP ###

 ifndef SSWAPKERNEL
@@ -453,7 +479,7 @@ endif
 SBLASOBJS	+= \
 	 samax_k$(TSUFFIX).$(SUFFIX)  samin_k$(TSUFFIX).$(SUFFIX)  smax_k$(TSUFFIX).$(SUFFIX)  smin_k$(TSUFFIX).$(SUFFIX) \
 	isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \
 	sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
 	sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
 	sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \
 	snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \
 	saxpby_k$(TSUFFIX).$(SUFFIX)
@@ -463,31 +489,32 @@ DBLASOBJS	+= \
 	idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \
 	dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \
 	dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \
 	daxpby_k$(TSUFFIX).$(SUFFIX)
 	daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX)

 QBLASOBJS	+= \
 	 qamax_k$(TSUFFIX).$(SUFFIX)  qamin_k$(TSUFFIX).$(SUFFIX)  qmax_k$(TSUFFIX).$(SUFFIX)  qmin_k$(TSUFFIX).$(SUFFIX) \
 	iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \
 	qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \
 	qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX)
 	qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \
 	qsum_k$(TSUFFIX).$(SUFFIX)

 CBLASOBJS	+= \
 	camax_k$(TSUFFIX).$(SUFFIX)  camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \
 	casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \
 	cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \
 	cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX)
 	cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX)

 ZBLASOBJS	+= \
 	 zamax_k$(TSUFFIX).$(SUFFIX)  zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \
 	zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \
 	zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \
 	zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX)
 	zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX)

 XBLASOBJS	+= \
 	 xamax_k$(TSUFFIX).$(SUFFIX)  xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \
 	xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \
 	xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \
 	xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX)
 	xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX)

 ### AMAX ###

@@ -617,7 +644,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX)  $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX)  : $(KE
 $(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX)  $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX)  : $(KERNELDIR)/$(IQMINKERNEL)
 	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@


 ### ASUM ###
 $(KDIR)sasum_k$(TSUFFIX).$(SUFFIX)   $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX)   : $(KERNELDIR)/$(SASUMKERNEL)
 	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@

@@ -636,6 +663,26 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX)  $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX)  : $(KE
 $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX)  $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX)  : $(KERNELDIR)/$(XASUMKERNEL)
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@

 ### SUM ###
 # One rule per precision. Each builds the regular and the profiled object
 # from the matching ?SUMKERNEL source; the variant is selected purely by
 # preprocessor flags: COMPLEX on/off, and DOUBLE / XDOUBLE for the precision
 # (single precision undefines DOUBLE).
 $(KDIR)ssum_k$(TSUFFIX).$(SUFFIX)   $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX)   : $(KERNELDIR)/$(SSUMKERNEL)
 	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@

 $(KDIR)dsum_k$(TSUFFIX).$(SUFFIX)   $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX)   : $(KERNELDIR)/$(DSUMKERNEL)
 	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@

 $(KDIR)qsum_k$(TSUFFIX).$(SUFFIX)   $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX)   : $(KERNELDIR)/$(QSUMKERNEL)
 	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@

 $(KDIR)csum_k$(TSUFFIX).$(SUFFIX)  $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX)  : $(KERNELDIR)/$(CSUMKERNEL)
 	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@

 $(KDIR)zsum_k$(TSUFFIX).$(SUFFIX)  $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX)  : $(KERNELDIR)/$(ZSUMKERNEL)
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@

 $(KDIR)xsum_k$(TSUFFIX).$(SUFFIX)  $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX)  : $(KERNELDIR)/$(XSUMKERNEL)
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@

 ### AXPY ###
 $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX)  $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX)  : $(KERNELDIR)/$(SAXPYKERNEL)
 	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@

--- a/kernel/alpha/sum.S
+++ b/kernel/alpha/sum.S
@@ -0,0 +1,206 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

 #define ASSEMBLER
 #include "common.h"
 #include "version.h"

 /* Scale factor for the look-ahead load in the main loop, which touches
    PREFETCHSIZE * 2 * SIZE bytes past the current X. */
 #define PREFETCHSIZE	88

 /* Integer registers: element count, current element pointer, stride and
    loop counter. N, X and INCX are read on entry, so they carry the
    kernel's three arguments. */
 #define N	$16
 #define X	$17
 #define INCX	$18
 #define I	$19

 /* Partial-sum accumulators. s0 holds the final result when the kernel
    returns. */
 #define s0	$f0
 #define s1	$f1
 #define s2	$f10
 #define s3	$f11

 /* Destination registers for the eight elements loaded per unrolled group. */
 #define a0	$f12
 #define a1	$f13
 #define a2	$f14
 #define a3	$f15
 #define a4	$f16
 #define a5	$f17
 #define a6	$f18
 #define a7	$f19

 /* Staging registers: each holds a loaded value until a later ADD folds it
    into one of the accumulators. */
 #define t0	$f20
 #define t1	$f21
 #define t2	$f22
 #define t3	$f23

 /*
  * Strided sum kernel: adds up N elements read through X, advancing X with
  * SXADDQ INCX, X, X after every load, and leaves the total in s0.
  *
  * The work is unrolled by eight and software-pipelined: a loaded value is
  * first copied into a staging register (t0..t3) and only added to an
  * accumulator (s0..s3) by a later ADD, so at any point up to four values
  * are still "pending". The instruction order below is what keeps that
  * pipeline correct - do not reorder.
  *
  * NOTE(review): LD, ADD and SXADDQ are macros defined outside this file;
  * presumably they select the precision-specific load/add and scale INCX by
  * the element size - confirm in the Alpha common header.
  */
 	PROLOGUE
 	PROFCODE

 /* s0 and t0 are cleared first so that the early exit returns s0 + t0 = 0
    when N <= 0. */
 	fclr	s0
 	unop
 	fclr	t0
 	ble	N,  $L999

 /* I = N >> 3: number of full 8-element groups. s2 is cleared before the
    branch because $L15 unconditionally adds it into s0. */
 	sra	N, 3, I
 	fclr	s1
 	fclr	s2
 	ble	I, $L15

 /* Pipeline fill: preload the first six elements of the first group while
    clearing the remaining staging registers and s3. */
 	LD	a0,  0 * SIZE(X)
 	fclr	t1
 	SXADDQ	INCX, X, X
 	fclr	t2

 	LD	a1,  0 * SIZE(X)
 	fclr	t3
 	SXADDQ	INCX, X, X
 	fclr	s3

 	LD	a2,  0 * SIZE(X)
 	SXADDQ	INCX, X, X
 	LD	a3,  0 * SIZE(X)
 	SXADDQ	INCX, X, X

 	LD	a4,  0 * SIZE(X)
 	SXADDQ	INCX, X, X
 	LD	a5,  0 * SIZE(X)
 	SXADDQ	INCX, X, X

 /* With exactly one group there is no steady-state iteration: go straight
    to the drain code. */
 	lda	I,  -1(I)
 	ble	I, $L13
 	.align 4

 /* Steady state. Each pass folds the four pending values plus the first
    four values of the current group into s0..s3, finishes loading the
    current group (a6, a7) and preloads a0..a5 of the next one. On exit
    t0..t3 hold a4..a7 of the group just processed. */
 $L12:
 	ADD	s0, t0, s0
 /* Load into $31: the result is discarded, so this only touches memory
    ahead of X (prefetch). */
 	ldl	$31, PREFETCHSIZE * 2 * SIZE(X)
 	fmov	a0, t0
 	lda	I,  -1(I)

 	ADD	s1, t1, s1
 	LD	a6,  0 * SIZE(X)
 	fmov	a1, t1
 	SXADDQ	INCX, X, X

 	ADD	s2, t2, s2
 	LD	a7,  0 * SIZE(X)
 	fmov	a2, t2
 	SXADDQ	INCX, X, X

 	ADD	s3, t3, s3
 	LD	a0,  0 * SIZE(X)
 	fmov	a3, t3
 	SXADDQ	INCX, X, X

 	ADD	s0, t0, s0
 	LD	a1,  0 * SIZE(X)
 	fmov	a4, t0
 	SXADDQ	INCX, X, X

 	ADD	s1, t1, s1
 	LD	a2,  0 * SIZE(X)
 	fmov	a5, t1
 	SXADDQ	INCX, X, X

 	ADD	s2, t2, s2
 	LD	a3,  0 * SIZE(X)
 	fmov	a6, t2
 	SXADDQ	INCX, X, X

 	ADD	s3, t3, s3
 	LD	a4,  0 * SIZE(X)
 	fmov	a7, t3
 	SXADDQ	INCX, X, X

 	LD	a5,  0 * SIZE(X)
 	unop
 	SXADDQ	INCX, X, X
 	bne	I, $L12
 	.align 4

 /* Drain: load the last two elements of the final group (a6, a7) and fold
    everything in, without preloading a further group. */
 $L13:
 	ADD	s0, t0, s0
 	LD	a6,  0 * SIZE(X)
 	fmov	a0, t0
 	SXADDQ	INCX, X, X

 	ADD	s1, t1, s1
 	LD	a7,  0 * SIZE(X)
 	fmov	a1, t1
 	SXADDQ	INCX, X, X

 	ADD	s2, t2, s2
 	fmov	a2, t2
 	ADD	s3, t3, s3
 	fmov	a3, t3

 	ADD	s0, t0, s0
 	fmov	a4, t0
 	ADD	s1, t1, s1
 	fmov	a5, t1
 	ADD	s2, t2, s2
 	fmov	a6, t2
 	ADD	s3, t3, s3
 	fmov	a7, t3

 /* t1..t3 are flushed here; t0 (holding a4) is deliberately left pending
    and is added by the next ADD s0, t0, s0 at $L17 or $L999. */
 	ADD	s1, t1, s1
 	ADD	s2, t2, s2
 	ADD	s3, t3, s3

 /* Reduce four accumulators to two; $L15 finishes the reduction. */
 	ADD	s0, s1, s0
 	ADD	s2, s3, s2
 	.align 4

 /* Tail: I = N & 7 leftover elements. s0 += s2 completes the reduction
    (s2 is 0 when the unrolled part was skipped). */
 $L15:
 	and	N, 7, I
 	ADD	s0, s2, s0
 	unop
 	ble	I, $L999
 	.align 4

 /* One element per pass, still one step behind: add the pending t0, then
    load the next element and stage it in t0. */
 $L17:
 	ADD	s0, t0, s0
 	LD	a0,  0 * SIZE(X)
 	SXADDQ	INCX, X, X
 	fmov	a0, t0

 	lda	I,  -1(I)
 	bne	I, $L17
 	.align 4

 /* Common exit: fold in the last pending value and return with the result
    in s0. */
 $L999:
 	ADD	s0, t0, s0
 	ret
 	EPILOGUE
--- a/kernel/alpha/zsum.S
+++ b/kernel/alpha/zsum.S
@@ -0,0 +1,208 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

 #define ASSEMBLER
 #include "common.h"
 #include "version.h"

 /* Offset, in units of SIZE, used by the `ldl $31` in the main loop (a load
    whose destination is $31; presumably a prefetch hint - confirm). */
 #define PREFETCHSIZE	88

 /* Integer registers used by the routine:
      N    - element count (tested against zero, shifted/masked for loop counts)
      X    - address the values are loaded from
      INCX - stride; doubled on entry to the routine
      I    - loop counter */
 #define N	$16
 #define X	$17
 #define INCX	$18
 #define I	$19

 /* Accumulators.  s0/s2 collect the values loaded from offset 0 of each
    element, s1/s3 those loaded from offset 1; s0 holds the final result. */
 #define s0	$f0
 #define s1	$f1
 #define s2	$f10
 #define s3	$f11

 /* Load destinations: four elements of two values each per unrolled group. */
 #define a0	$f12
 #define a1	$f13
 #define a2	$f14
 #define a3	$f15
 #define a4	$f16
 #define a5	$f17
 #define a6	$f18
 #define a7	$f19

 /* Staging registers: a loaded value is copied here with fmov and added to
    an accumulator by a later ADD. */
 #define t0	$f20
 #define t1	$f21
 #define t2	$f22
 #define t3	$f23

 /*
  * Two-values-per-element sum kernel: for each of N elements it loads the
  * values at offsets 0 and 1 from X, sums all of them and leaves the total
  * in s0.
  *
  * The main loop handles four elements (eight values) per iteration.  Loaded
  * values are copied into staging registers t0-t3 and added by a later ADD,
  * so some staged values are pending whenever a phase ends.
  *
  * NOTE(review): SXADDQ, LD, ADD and SIZE are defined outside this view;
  * SXADDQ presumably advances X by INCX scaled by the element size - confirm.
  */
 	PROLOGUE
 	PROFCODE

 	/* Clear result/staging registers; INCX is doubled because every
 	   element consists of two values. */
 	fclr	s0
 	unop
 	fclr	t0
 	addq	INCX, INCX, INCX

 	fclr	s1
 	unop
 	fclr	t1
 	ble	N,  $L999

 	/* I = N >> 2: number of full groups of four elements. */
 	fclr	s2
 	sra	N, 2, I
 	fclr	s3
 	ble	I, $L15

 	/* Preload the first three elements of the first group (a0..a5). */
 	LD	a0,  0 * SIZE(X)
 	fclr	t2
 	LD	a1,  1 * SIZE(X)
 	SXADDQ	INCX, X, X

 	LD	a2,  0 * SIZE(X)
 	fclr	t3
 	LD	a3,  1 * SIZE(X)
 	SXADDQ	INCX, X, X

 	LD	a4,  0 * SIZE(X)
 	LD	a5,  1 * SIZE(X)
 	SXADDQ	INCX, X, X
 	/* The last group is finished at $L13, so loop I - 1 times. */
 	lda	I,  -1(I)

 	ble	I, $L13
 	.align 4

 /* Main loop: ADD folds the previously staged value into its accumulator,
    fmov stages the next one, and the loads for the fourth element of this
    group and the first three of the next group are interleaved. */
 $L12:
 	ADD	s0, t0, s0
 	ldl	$31, PREFETCHSIZE * SIZE(X)
 	fmov	a0, t0
 	lda	I,  -1(I)

 	ADD	s1, t1, s1
 	LD	a6,  0 * SIZE(X)
 	fmov	a1, t1
 	unop

 	ADD	s2, t2, s2
 	LD	a7,  1 * SIZE(X)
 	fmov	a2, t2
 	SXADDQ	INCX, X, X

 	ADD	s3, t3, s3
 	LD	a0,  0 * SIZE(X)
 	fmov	a3, t3
 	unop

 	ADD	s0, t0, s0
 	LD	a1,  1 * SIZE(X)
 	fmov	a4, t0
 	SXADDQ	INCX, X, X

 	ADD	s1, t1, s1
 	LD	a2,  0 * SIZE(X)
 	fmov	a5, t1
 	unop

 	ADD	s2, t2, s2
 	LD	a3,  1 * SIZE(X)
 	fmov	a6, t2
 	SXADDQ	INCX, X, X

 	ADD	s3, t3, s3
 	LD	a4,  0 * SIZE(X)
 	fmov	a7, t3
 	unop

 	LD	a5,  1 * SIZE(X)
 	unop
 	SXADDQ	INCX, X, X
 	bne	I, $L12
 	.align 4

 /* Drain: load the fourth element (a6, a7) of the final group and flush the
    staged values. */
 $L13:
 	ADD	s0, t0, s0
 	LD	a6,  0 * SIZE(X)
 	fmov	a0, t0

 	ADD	s1, t1, s1
 	LD	a7,  1 * SIZE(X)
 	fmov	a1, t1
 	SXADDQ	INCX, X, X

 	ADD	s2, t2, s2
 	fmov	a2, t2
 	ADD	s3, t3, s3
 	fmov	a3, t3

 	ADD	s0, t0, s0
 	fmov	a4, t0
 	ADD	s1, t1, s1
 	fmov	a5, t1
 	ADD	s2, t2, s2
 	fmov	a6, t2
 	ADD	s3, t3, s3
 	fmov	a7, t3

 	/* Flush t2/t3 (a6, a7).  t0/t1 (a4, a5) stay pending and are added
 	   at $L17 or $L999. */
 	ADD	s2, t2, s2
 	ADD	s3, t3, s3

 	.align 4

 /* Remainder setup: fold s2/s3 into s0/s1; I = N & 3 leftover elements. */
 $L15:
 	ADD	s0, s2, s0
 	and	N, 3, I
 	ADD	s1, s3, s1
 	ble	I, $L999
 	.align 4

 /* Remainder loop: one element (two values) per iteration, staged through
    t0/t1 so each ADD consumes the value loaded on the previous pass. */
 $L17:
 	ADD	s0, t0, s0
 	LD	a0,  0 * SIZE(X)
 	fmov	a0, t0
 	lda	I,  -1(I)

 	ADD	s1, t1, s1
 	LD	a1,  1 * SIZE(X)
 	fmov	a1, t1
 	SXADDQ	INCX, X, X

 	bne	I, $L17
 	.align 4

 /* Add the last pending staged values, combine both partial sums into s0
    and return. */
 $L999:
 	ADD	s0, t0, s0
 	ADD	s1, t1, s1

 	ADD	s0, s1, s0
 	ret
 	EPILOGUE
--- a/kernel/arm/KERNEL.ARMV5
+++ b/kernel/arm/KERNEL.ARMV5
@@ -35,6 +35,11 @@ DASUMKERNEL  = ../arm/asum.c
 CASUMKERNEL  = ../arm/zasum.c
 ZASUMKERNEL  = ../arm/zasum.c

 # ?SUM kernels: S/D use the portable C source sum.c, C/Z use zsum.c.
 SSUMKERNEL  = ../arm/sum.c
 DSUMKERNEL  = ../arm/sum.c
 CSUMKERNEL  = ../arm/zsum.c
 ZSUMKERNEL  = ../arm/zsum.c

 SAXPYKERNEL  = ../arm/axpy.c
 DAXPYKERNEL  = ../arm/axpy.c
 CAXPYKERNEL  = ../arm/zaxpy.c
--- a/kernel/arm/KERNEL.ARMV6
+++ b/kernel/arm/KERNEL.ARMV6
@@ -37,6 +37,9 @@ DASUMKERNEL  = asum_vfp.S
 CASUMKERNEL  = asum_vfp.S
 ZASUMKERNEL  = asum_vfp.S

 # ?SUM kernels: S and D both use the VFP assembly source sum_vfp.S.
 # NOTE(review): no CSUMKERNEL/ZSUMKERNEL is set in this hunk - confirm they
 # are provided elsewhere.
 SSUMKERNEL  = sum_vfp.S
 DSUMKERNEL  = sum_vfp.S

 SAXPYKERNEL  = axpy_vfp.S
 DAXPYKERNEL  = axpy_vfp.S
 CAXPYKERNEL  = axpy_vfp.S
--- a/kernel/arm/sum.c
+++ b/kernel/arm/sum.c
@@ -0,0 +1,51 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

 /**************************************************************************************
 * trivial copy of asum.c with the ABS() removed                                       *
 **************************************************************************************/


 #include "common.h"
 #include <math.h>

 /*
  * Plain sum of a strided vector.
  *
  * Adds up the n values x[0], x[inc_x], x[2*inc_x], ... without taking
  * absolute values and returns the total.  Returns 0 when n or inc_x is
  * not strictly positive.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	FLOAT total = 0.0;
 	BLASLONG pos;
 	BLASLONG limit;

 	if (!(n > 0 && inc_x > 0)) return(total);

 	/* One past the last index that is visited. */
 	limit = n * inc_x;

 	for (pos = 0; pos < limit; pos += inc_x)
 	{
 		total += x[pos];
 	}

 	return(total);
 }


--- a/kernel/arm/sum_vfp.S
+++ b/kernel/arm/sum_vfp.S
@@ -0,0 +1,425 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

 /**************************************************************************************
 * trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed                                    *
 **************************************************************************************/

 #define ASSEMBLER
 #include "common.h"

 /* Not referenced anywhere in this file. */
 #define STACKSIZE 256

 /* Core registers read by the routine:
      N     - element count
      X     - address the values are loaded from
      INC_X - stride in elements; converted to a byte stride on the strided path */
 #define	N	r0
 #define	X	r1
 #define	INC_X	r2


 /* Loop counter (r12 is also used to zero the accumulators on entry). */
 #define I	r12

 /* Byte offset ahead of X used by the pld instructions. */
 #define X_PRE	512

 /**************************************************************************************
 * Macro definitions
 **************************************************************************************/

 #if	!defined(COMPLEX)

 #if	defined(DOUBLE)

 /* Real, double precision.
    KERNEL_F4: contiguous path; loads four doubles with post-increment and
    adds them alternately into d0 and d1 (1st/3rd -> d0, 2nd/4th -> d1). */
 .macro KERNEL_F4

 	pld	[ X, #X_PRE  ]
 	vldmia.f64	X!, { d4 - d5 }
 	vadd.f64   d0  , d0,  d4
 	vldmia.f64	X!, { d6 - d7 }
 	vadd.f64   d1  , d1,  d5
 	vadd.f64   d0  , d0,  d6
 	vadd.f64   d1  , d1,  d7

 .endm

 /* KERNEL_F1: contiguous path; loads one double and adds it to d0. */
 .macro KERNEL_F1

 	vldmia.f64	X!, { d4 }
 	vadd.f64   d0  , d0,  d4

 .endm


 /* KERNEL_S4: strided path; four times: load one double, add it to d0 and
    advance X by INC_X (a byte count at this point). */
 .macro KERNEL_S4

 	vldmia.f64	X, { d4 }
 	vadd.f64   d0  , d0,  d4
 	add	X, X, INC_X

 	vldmia.f64	X, { d4 }
 	vadd.f64   d0  , d0,  d4
 	add	X, X, INC_X

 	vldmia.f64	X, { d4 }
 	vadd.f64   d0  , d0,  d4
 	add	X, X, INC_X

 	vldmia.f64	X, { d4 }
 	vadd.f64   d0  , d0,  d4
 	add	X, X, INC_X

 .endm


 /* KERNEL_S1: strided path; one double into d0, then X += INC_X. */
 .macro KERNEL_S1

 	vldmia.f64	X, { d4 }
 	vadd.f64   d0  , d0,  d4
 	add	X, X, INC_X

 .endm

 #else

 /* Real, single precision.
    KERNEL_F4: contiguous path; loads four floats with post-increment and
    adds them alternately into s0 and s1.  There is no pld in this variant;
    the caller loop issues it for the real single-precision build. */
 .macro KERNEL_F4

 	vldmia.f32	X!, { s4 - s5 }
 	vadd.f32   s0  , s0,  s4
 	vldmia.f32	X!, { s6 - s7 }
 	vadd.f32   s1  , s1,  s5
 	vadd.f32   s0  , s0,  s6
 	vadd.f32   s1  , s1,  s7

 .endm

 /* KERNEL_F1: contiguous path; loads one float and adds it to s0. */
 .macro KERNEL_F1

 	vldmia.f32	X!, { s4 }
 	vadd.f32   s0  , s0,  s4

 .endm


 /* KERNEL_S4: strided path; four times: load one float, add it to s0 and
    advance X by INC_X (a byte count at this point). */
 .macro KERNEL_S4

 	vldmia.f32	X, { s4 }
 	vadd.f32   s0  , s0,  s4
 	add	X, X, INC_X

 	vldmia.f32	X, { s4 }
 	vadd.f32   s0  , s0,  s4
 	add	X, X, INC_X

 	vldmia.f32	X, { s4 }
 	vadd.f32   s0  , s0,  s4
 	add	X, X, INC_X

 	vldmia.f32	X, { s4 }
 	vadd.f32   s0  , s0,  s4
 	add	X, X, INC_X

 .endm


 /* KERNEL_S1: strided path; one float into s0, then X += INC_X. */
 .macro KERNEL_S1

 	vldmia.f32	X, { s4 }
 	vadd.f32   s0  , s0,  s4
 	add	X, X, INC_X

 .endm


 #endif

 #else

 #if	defined(DOUBLE)

 /* Complex, double precision (two doubles per element).
    KERNEL_F4: contiguous path; two rounds of four doubles each, i.e. eight
    doubles in total, added alternately into d0 and d1. */
 .macro KERNEL_F4

 	pld	[ X, #X_PRE  ]
 	vldmia.f64	X!, { d4 - d5 }
 	vadd.f64   d0  , d0,  d4
 	vldmia.f64	X!, { d6 - d7 }
 	vadd.f64   d1  , d1,  d5
 	vadd.f64   d0  , d0,  d6
 	vadd.f64   d1  , d1,  d7

 	pld	[ X, #X_PRE  ]
 	vldmia.f64	X!, { d4 - d5 }
 	vadd.f64   d0  , d0,  d4
 	vldmia.f64	X!, { d6 - d7 }
 	vadd.f64   d1  , d1,  d5
 	vadd.f64   d0  , d0,  d6
 	vadd.f64   d1  , d1,  d7


 .endm

 /* KERNEL_F1: contiguous path; loads two consecutive doubles one at a time
    and adds both to d0. */
 .macro KERNEL_F1

 	vldmia.f64	X!, { d4 }
 	vadd.f64   d0  , d0,  d4

 	vldmia.f64	X!, { d4 }
 	vadd.f64   d0  , d0,  d4


 .endm


 /* KERNEL_S4: strided path; four times: load a pair of doubles, add both to
    d0 and advance X by INC_X (a byte count at this point). */
 .macro KERNEL_S4

 	vldmia.f64	X, { d4 -d5 }
 	vadd.f64   d0  , d0,  d4
 	vadd.f64   d0  , d0,  d5
 	add	X, X, INC_X

 	vldmia.f64	X, { d4 -d5 }
 	vadd.f64   d0  , d0,  d4
 	vadd.f64   d0  , d0,  d5
 	add	X, X, INC_X

 	vldmia.f64	X, { d4 -d5 }
 	vadd.f64   d0  , d0,  d4
 	vadd.f64   d0  , d0,  d5
 	add	X, X, INC_X

 	vldmia.f64	X, { d4 -d5 }
 	vadd.f64   d0  , d0,  d4
 	vadd.f64   d0  , d0,  d5
 	add	X, X, INC_X

 .endm


 /* KERNEL_S1: strided path; one pair of doubles into d0, then X += INC_X. */
 .macro KERNEL_S1

 	vldmia.f64	X, { d4 -d5 }
 	vadd.f64   d0  , d0,  d4
 	vadd.f64   d0  , d0,  d5
 	add	X, X, INC_X

 .endm

 #else

 /* Complex, single precision (two floats per element).
    KERNEL_F4: contiguous path; two rounds of four floats each, i.e. eight
    floats in total, added alternately into s0 and s1.  Only the first round
    is preceded by a pld. */
 .macro KERNEL_F4

 	pld	[ X, #X_PRE  ]
 	vldmia.f32	X!, { s4 - s5 }
 	vadd.f32   s0  , s0,  s4
 	vldmia.f32	X!, { s6 - s7 }
 	vadd.f32   s1  , s1,  s5
 	vadd.f32   s0  , s0,  s6
 	vadd.f32   s1  , s1,  s7

 	vldmia.f32	X!, { s4 - s5 }
 	vadd.f32   s0  , s0,  s4
 	vldmia.f32	X!, { s6 - s7 }
 	vadd.f32   s1  , s1,  s5
 	vadd.f32   s0  , s0,  s6
 	vadd.f32   s1  , s1,  s7


 .endm

 /* KERNEL_F1: contiguous path; loads two consecutive floats one at a time
    and adds both to s0. */
 .macro KERNEL_F1

 	vldmia.f32	X!, { s4 }
 	vadd.f32   s0  , s0,  s4

 	vldmia.f32	X!, { s4 }
 	vadd.f32   s0  , s0,  s4

 .endm


 /* KERNEL_S4: strided path; four times: load a pair of floats, add both to
    s0 and advance X by INC_X (a byte count at this point). */
 .macro KERNEL_S4

 	vldmia.f32	X, { s4 -s5 }
 	vadd.f32   s0  , s0,  s4
 	vadd.f32   s0  , s0,  s5
 	add	X, X, INC_X

 	vldmia.f32	X, { s4 -s5 }
 	vadd.f32   s0  , s0,  s4
 	vadd.f32   s0  , s0,  s5
 	add	X, X, INC_X

 	vldmia.f32	X, { s4 -s5 }
 	vadd.f32   s0  , s0,  s4
 	vadd.f32   s0  , s0,  s5
 	add	X, X, INC_X

 	vldmia.f32	X, { s4 -s5 }
 	vadd.f32   s0  , s0,  s4
 	vadd.f32   s0  , s0,  s5
 	add	X, X, INC_X

 .endm


 /* KERNEL_S1: strided path; one pair of floats into s0, then X += INC_X. */
 .macro KERNEL_S1

 	vldmia.f32	X, { s4 -s5 }
 	vadd.f32   s0  , s0,  s4
 	vadd.f32   s0  , s0,  s5
 	add	X, X, INC_X

 .endm

 #endif

 #endif

 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/

 /*
  * Sum kernel entry point.  Adds up N elements read from X and returns the
  * total.  Two partial sums are kept (s0/s1, or d0/d1 for DOUBLE) and are
  * combined just before returning.  INC_X == 1 takes the contiguous path,
  * any other non-zero INC_X takes the strided path.
  *
  * The labels keep the asum_ prefix of the file this one was copied from.
  *
  * NOTE(review): only INC_X == 0 is rejected below; a negative INC_X falls
  * through to the strided path, whereas arm/sum.c returns 0 for inc_x <= 0.
  * Confirm whether this difference is intended.
  */
 	PROLOGUE

 	.align 5

        movs    r12, #0                                          // clear floating point register
        vmov    s0, r12
        vmov    s1, r12
 #if     defined(DOUBLE)
        vcvt.f64.f32    d0, s0
        vcvt.f64.f32    d1, s1
 #endif

 	/* Nothing to add for N <= 0 or a zero stride. */
 	cmp	N, #0
 	ble	asum_kernel_L999

 	cmp	INC_X, #0
 	beq	asum_kernel_L999

 	cmp	INC_X, #1
 	bne	asum_kernel_S_BEGIN


 /* Contiguous path: I = N >> 2 passes of KERNEL_F4. */
 asum_kernel_F_BEGIN:

 	asrs	I, N, #2					// I = N / 4
 	ble	asum_kernel_F1

 	.align 5

 /* Loop body is unrolled twice, with an exit check between the two halves. */
 asum_kernel_F4:

 #if !defined(DOUBLE) && !defined(COMPLEX)
 	pld	[ X, #X_PRE  ]
 #endif
 	KERNEL_F4

 	subs	I, I, #1
 	ble	asum_kernel_F1

 	KERNEL_F4

 	subs	I, I, #1
 	bne	asum_kernel_F4

 /* Contiguous remainder: N & 3 passes of KERNEL_F1. */
 asum_kernel_F1:

 	ands	I, N, #3
 	ble	asum_kernel_L999

 asum_kernel_F10:

 	KERNEL_F1

 	subs    I, I, #1
        bne     asum_kernel_F10

 	b	asum_kernel_L999

 /* Strided path: first turn INC_X from an element count into a byte count. */
 asum_kernel_S_BEGIN:

 #if defined(COMPLEX)

 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
 #else
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
 #endif

 #else

 #if defined(DOUBLE)
 	lsl	INC_X, INC_X, #3				// INC_X * SIZE
 #else
 	lsl	INC_X, INC_X, #2				// INC_X * SIZE
 #endif

 #endif

 	asrs	I, N, #2					// I = N / 4
 	ble	asum_kernel_S1

 	.align 5

 asum_kernel_S4:

 	KERNEL_S4

 	subs	I, I, #1
 	bne	asum_kernel_S4

 /* Strided remainder: N & 3 passes of KERNEL_S1. */
 asum_kernel_S1:

 	ands	I, N, #3
 	ble	asum_kernel_L999

 asum_kernel_S10:

 	KERNEL_S1

 	subs    I, I, #1
        bne     asum_kernel_S10


 /* Common exit: combine the two partial sums. */
 asum_kernel_L999:


 #if defined(DOUBLE)
 	vadd.f64	d0 , d0, d1				// set return value
 #else
 	vadd.f32	s0 , s0, s1				// set return value
 #endif

 /* When __ARM_PCS_VFP is not defined the result is also copied into core
    registers: r0 for single precision, r0:r1 for double. */
 #if !defined(__ARM_PCS_VFP)
 #if !defined(DOUBLE)
 	vmov	r0, s0
 #else
 	vmov	r0, r1, d0
 #endif
 #endif

 	bx	lr

 	EPILOGUE

--- a/kernel/arm/zsum.c
+++ b/kernel/arm/zsum.c
@@ -0,0 +1,57 @@
 /***************************************************************************
 Copyright (c) 2013, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

 /**************************************************************************************
 * trivial copy of zasum.c with the ABS() removed                                      *
 **************************************************************************************/


 #include "common.h"
 #include <math.h>

 /*
  * Sum of the two consecutive values x[i] and x[i+1], i.e. the two stored
  * components of one element.  Arguments and the whole replacement list are
  * parenthesized so the macro expands safely inside larger expressions and
  * with non-trivial arguments.
  */
 #define CSUM1(x,i)	((x)[(i)]+(x)[(i)+1])

 /*
  * Plain sum over a strided vector whose elements each occupy two
  * consecutive FLOAT values.
  *
  * For each of the n elements (element k starts at x[2*k*inc_x]) both stored
  * values are added to the running total; no absolute value is taken.
  * Returns 0 when n or inc_x is not strictly positive.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0;
 	FLOAT sumf = 0.0;
 	BLASLONG inc_x2;

 	if (n <= 0 || inc_x <= 0) return(sumf);

 	/* Stride measured in FLOATs: two values per element. */
 	inc_x2 = 2 * inc_x;

 	/* n becomes the index one past the last element start. */
 	n *= inc_x2;
 	while(i < n)
 	{
 		sumf += CSUM1(x,i);
 		i += inc_x2;
 	}
 	return(sumf);
 }


--- a/kernel/arm64/csum.S
+++ b/kernel/arm64/csum.S
@@ -0,0 +1,164 @@
 /*******************************************************************************
 Copyright (c) 2019, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/

 #define ASSEMBLER
 #include "common.h"

 #define	N	x0	/* vector length */
 #define	X	x1	/* X vector address */
 #define	INC_X	x2	/* X stride */
 #define I	x5	/* loop variable */

 /*******************************************************************************
 * Macro definitions
 *******************************************************************************/

 /* REG0: zero register used to clear SUMF.
    SUMF: running scalar sum (also lane 0 of v0, the vector accumulator).
    TMPF: scalar scratch register (lane 0 of v1).
    TMPVF and SZ are not referenced in this file. */
 #define REG0	wzr
 #define SUMF	s0
 #define TMPF	s1
 #define TMPVF	{v1.s}[0]
 #define SZ	4

 /******************************************************************************/

 /* KERNEL_F1: contiguous path, one element.  Loads two floats into v1,
    rotates v1 by 4 bytes into v2 so that s2 holds the second float, adds it
    to the first (TMPF is lane 0 of v1) and adds that to SUMF.  X += 8. */
 .macro KERNEL_F1
 	ld1	{v1.2s}, [X], #8
 	ext	v2.8b, v1.8b, v1.8b, #4
 	fadd	TMPF, TMPF, s2
 	fadd	SUMF, SUMF, TMPF
 .endm

 /* KERNEL_F8: contiguous path, 16 floats (64 bytes) per call.  The four
    loaded vectors are reduced pairwise and accumulated lane-wise into v0. */
 .macro KERNEL_F8
 	ld1	{v1.4s, v2.4s, v3.4s, v4.4s}, [X]
 	add	X, X, #64

 	PRFM	PLDL1KEEP, [X, #1024]

 	fadd	v1.4s, v1.4s, v2.4s
 	fadd	v3.4s, v3.4s, v4.4s
 	fadd	v0.4s, v0.4s, v1.4s
 	fadd	v0.4s, v0.4s, v3.4s
 .endm

 /* KERNEL_F8_FINALIZE: horizontal reduction of the four lanes of v0 into
    the scalar SUMF (upper half added onto lower half, then a pairwise add). */
 .macro KERNEL_F8_FINALIZE
 	ext	v1.16b, v0.16b, v0.16b, #8
 	fadd	v0.2s, v0.2s, v1.2s
 	faddp	SUMF, v0.2s
 .endm

 /* INIT_S: convert INC_X to a byte stride (multiply by 8). */
 .macro INIT_S
 	lsl	INC_X, INC_X, #3
 .endm

 /* KERNEL_S1: strided path, one element.  Same arithmetic as KERNEL_F1 but
    X is post-incremented by INC_X. */
 .macro KERNEL_S1
 	ld1	{v1.2s}, [X], INC_X
 	ext	v2.8b, v1.8b, v1.8b, #4
 	fadd	TMPF, TMPF, s2
 	fadd	SUMF, SUMF, TMPF

 .endm

 /*******************************************************************************
 * End of macro definitions
 *******************************************************************************/

 /*
  * Sum kernel for elements made of two floats each.  Adds up both floats of
  * N elements read from X and returns the total in SUMF.  Returns 0 when N
  * or INC_X is not positive.  INC_X == 1 takes the vectorised contiguous
  * path, any other positive stride the scalar strided path.
  */
 	PROLOGUE

 	/* Clear the result and the scratch register. */
 	fmov	SUMF, REG0
 	fmov	s1, SUMF

 	cmp	N, xzr
 	ble	.Lcsum_kernel_L999
 	cmp	INC_X, xzr
 	ble	.Lcsum_kernel_L999

 	cmp	INC_X, #1
 	bne	.Lcsum_kernel_S_BEGIN

 /* Contiguous path: I = N >> 3 passes of KERNEL_F8. */
 .Lcsum_kernel_F_BEGIN:

 	asr	I, N, #3
 	cmp	I, xzr
 	beq	.Lcsum_kernel_F1

 .Lcsum_kernel_F8:

 	KERNEL_F8

 	subs	I, I, #1
 	bne	.Lcsum_kernel_F8

 	/* Collapse the vector accumulator into SUMF before the scalar tail. */
 	KERNEL_F8_FINALIZE

 /* Contiguous remainder: N & 7 passes of KERNEL_F1. */
 .Lcsum_kernel_F1:

 	ands	I, N, #7
 	ble	.Lcsum_kernel_L999

 .Lcsum_kernel_F10:

 	KERNEL_F1

 	subs    I, I, #1
        bne     .Lcsum_kernel_F10

 .Lcsum_kernel_L999:
 	ret

 /* Strided path: I = N >> 2 passes of four KERNEL_S1 each. */
 .Lcsum_kernel_S_BEGIN:

 	INIT_S

 	asr	I, N, #2
 	cmp	I, xzr
 	ble	.Lcsum_kernel_S1

 .Lcsum_kernel_S4:

 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1

 	subs	I, I, #1
 	bne	.Lcsum_kernel_S4

 /* Strided remainder: N & 3 passes of KERNEL_S1. */
 .Lcsum_kernel_S1:

 	ands	I, N, #3
 	ble	.Lcsum_kernel_L999

 .Lcsum_kernel_S10:

 	KERNEL_S1

 	subs    I, I, #1
        bne     .Lcsum_kernel_S10

 	ret

 	EPILOGUE
--- a/kernel/arm64/sum.S
+++ b/kernel/arm64/sum.S
@@ -0,0 +1,186 @@
 /*******************************************************************************
 Copyright (c) 2019, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/

 #define ASSEMBLER
 #include "common.h"

 #define	N	x0	/* vector length */
 #define	X	x1	/* X vector address */
 #define	INC_X	x2	/* X stride */
 #define I	x5	/* loop variable */

 /*******************************************************************************
 * Macro definitions
 *******************************************************************************/

 /* Precision-dependent names:
      REG0  - zero register used to clear SUMF
      SUMF  - running scalar sum (lane 0 of v0, the vector accumulator)
      TMPF  - scalar scratch register (lane 0 of v1)
      TMPVF - lane 0 of v1 written as an ld1 single-lane operand
      SZ    - post-increment in bytes used by KERNEL_F1 */
 #if !defined(DOUBLE)
 #define REG0	wzr
 #define SUMF	s0
 #define TMPF	s1
 #define TMPVF	{v1.s}[0]
 #define SZ	4
 #else
 #define REG0	xzr
 #define SUMF	d0
 #define TMPF	d1
 #define TMPVF	{v1.d}[0]
 #define SZ	8
 #endif

 /******************************************************************************/

 /* KERNEL_F1: contiguous path; load one value with post-increment SZ and
    add it to SUMF. */
 .macro KERNEL_F1
 	ldr	TMPF, [X], #SZ
 	fadd	SUMF, SUMF, TMPF
 .endm

 /* KERNEL_F8: contiguous path; eight values per call, accumulated lane-wise
    into v0 (four float lanes or two double lanes). */
 .macro KERNEL_F8
 #if !defined(DOUBLE)
 	ld1	{v1.4s, v2.4s}, [X], #32	// Load 8 floats: v1 = X0..X3, v2 = X4..X7
 	fadd	v1.4s, v1.4s, v2.4s		// v1 = [X0+X4, X1+X5, X2+X6, X3+X7]
 	fadd	v0.4s, v0.4s, v1.4s		// accumulate lane-wise into v0
 	PRFM	PLDL1KEEP, [X, #1024]
 #else // DOUBLE
 	ld1	{v2.2d, v3.2d, v4.2d, v5.2d}, [X]
 	add	X, X, #64

 	PRFM	PLDL1KEEP, [X, #1024]

 	fadd	v2.2d, v2.2d, v3.2d
 	fadd	v4.2d, v4.2d, v5.2d
 	fadd	v0.2d, v0.2d, v2.2d
 	fadd	v0.2d, v0.2d, v4.2d
 #endif
 .endm

 /* KERNEL_F8_FINALIZE: horizontal reduction of the lanes of v0 into SUMF. */
 .macro KERNEL_F8_FINALIZE
 #if !defined(DOUBLE)
 	ext	v1.16b, v0.16b, v0.16b, #8
 	fadd	v0.2s, v0.2s, v1.2s
 	faddp	SUMF, v0.2s
 #else
 	faddp	SUMF, v0.2d
 #endif
 .endm

 /* INIT_S: convert INC_X to a byte stride (x4 for single, x8 for double). */
 .macro INIT_S
 #if !defined(DOUBLE)
 	lsl	INC_X, INC_X, #2
 #else
 	lsl	INC_X, INC_X, #3
 #endif
 .endm

 /* KERNEL_S1: strided path; load one value into lane 0 of v1, post-increment
    X by INC_X and add the value to SUMF. */
 .macro KERNEL_S1
 	ld1	TMPVF, [X], INC_X
 	fadd	SUMF, SUMF, TMPF
 .endm

 /*******************************************************************************
 * End of macro definitions
 *******************************************************************************/

 /*
  * Sum kernel: adds up N values read from X and returns the total in SUMF.
  * Returns 0 when N or INC_X is not positive.  INC_X == 1 takes the
  * vectorised contiguous path, any other positive stride the scalar strided
  * path.
  */
 	PROLOGUE

 	/* Clear the result and the scratch register. */
 	fmov	SUMF, REG0
 #if !defined(DOUBLE)
 	fmov	s1, SUMF
 #else
 	fmov	d1, SUMF
 #endif

 	cmp	N, xzr
 	ble	.Lsum_kernel_L999
 	cmp	INC_X, xzr
 	ble	.Lsum_kernel_L999

 	cmp	INC_X, #1
 	bne	.Lsum_kernel_S_BEGIN

 /* Contiguous path: I = N >> 3 passes of KERNEL_F8. */
 .Lsum_kernel_F_BEGIN:

 	asr	I, N, #3
 	cmp	I, xzr
 	beq	.Lsum_kernel_F1

 .Lsum_kernel_F8:

 	KERNEL_F8

 	subs	I, I, #1
 	bne	.Lsum_kernel_F8

 	/* Collapse the vector accumulator into SUMF before the scalar tail. */
 	KERNEL_F8_FINALIZE

 /* Contiguous remainder: N & 7 passes of KERNEL_F1. */
 .Lsum_kernel_F1:

 	ands	I, N, #7
 	ble	.Lsum_kernel_L999

 .Lsum_kernel_F10:

 	KERNEL_F1

 	subs    I, I, #1
        bne     .Lsum_kernel_F10

 .Lsum_kernel_L999:
 	ret

 /* Strided path: I = N >> 2 passes of four KERNEL_S1 each. */
 .Lsum_kernel_S_BEGIN:

 	INIT_S

 	asr	I, N, #2
 	cmp	I, xzr
 	ble	.Lsum_kernel_S1

 .Lsum_kernel_S4:

 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1

 	subs	I, I, #1
 	bne	.Lsum_kernel_S4

 /* Strided remainder: N & 3 passes of KERNEL_S1. */
 .Lsum_kernel_S1:

 	ands	I, N, #3
 	ble	.Lsum_kernel_L999

 .Lsum_kernel_S10:

 	KERNEL_S1

 	subs    I, I, #1
        bne     .Lsum_kernel_S10

 	ret

 	EPILOGUE
--- a/kernel/arm64/zsum.S
+++ b/kernel/arm64/zsum.S
@@ -0,0 +1,158 @@
 /*******************************************************************************
 Copyright (c) 2015, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/

 #define ASSEMBLER
 #include "common.h"

 #define	N	x0	/* vector length */
 #define	X	x1	/* X vector address */
 #define	INC_X	x2	/* X stride */
 #define I	x5	/* loop variable */

 /*******************************************************************************
 * Macro definitions
 *******************************************************************************/

 /* REG0: zero register used to clear SUMF.
    SUMF: running scalar sum (also lane 0 of v0, the vector accumulator).
    TMPF: scalar scratch register.
    TMPVF and SZ are not referenced in this file. */
 #define REG0	xzr
 #define SUMF	d0
 #define TMPF	d1
 #define TMPVF	{v1.d}[0]
 #define SZ	8

 /******************************************************************************/

 /* KERNEL_F1: contiguous path, one element.  Loads two doubles, adds them
    pairwise into TMPF and adds that to SUMF.  X += 16. */
 .macro KERNEL_F1
 	ld1	{v1.2d}, [X], #16
 	faddp	TMPF, v1.2d
 	fadd	SUMF, SUMF, TMPF
 .endm

 /* KERNEL_F4: contiguous path, eight doubles (64 bytes) per call.  The four
    loaded vectors are reduced pairwise and accumulated lane-wise into v0. */
 .macro KERNEL_F4
 	ld1	{v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64

 	fadd	v1.2d, v1.2d, v2.2d
 	fadd	v3.2d, v3.2d, v4.2d

 	fadd	v0.2d, v0.2d, v1.2d
 	fadd	v0.2d, v0.2d, v3.2d

 	PRFM	PLDL1KEEP, [X, #1024]
 .endm

 /* KERNEL_F4_FINALIZE: add the two lanes of v0 into the scalar SUMF. */
 .macro KERNEL_F4_FINALIZE
 	faddp	SUMF, v0.2d
 .endm

 /* INIT_S: convert INC_X to a byte stride (multiply by 16). */
 .macro INIT_S
 	lsl	INC_X, INC_X, #4
 .endm

 /* KERNEL_S1: strided path, one element.  Same arithmetic as KERNEL_F1 but
    X is post-incremented by INC_X. */
 .macro KERNEL_S1
 	ld1	{v1.2d}, [X], INC_X
 	faddp	TMPF, v1.2d
 	fadd	SUMF, SUMF, TMPF
 .endm

 /*******************************************************************************
 * End of macro definitions
 *******************************************************************************/

 /*
  * Sum kernel for elements made of two doubles each.  Adds up both doubles
  * of N elements read from X and returns the total in SUMF.  Returns 0 when
  * N or INC_X is not positive.  INC_X == 1 takes the vectorised contiguous
  * path, any other positive stride the scalar strided path.
  */
 	PROLOGUE

 	/* Clear the result. */
 	fmov	SUMF, REG0

 	cmp	N, xzr
 	ble	.Lzsum_kernel_L999
 	cmp	INC_X, xzr
 	ble	.Lzsum_kernel_L999

 	cmp	INC_X, #1
 	bne	.Lzsum_kernel_S_BEGIN

 /* Contiguous path: I = N >> 2 passes of KERNEL_F4. */
 .Lzsum_kernel_F_BEGIN:

 	asr	I, N, #2
 	cmp	I, xzr
 	beq	.Lzsum_kernel_F1

 .Lzsum_kernel_F4:

 	KERNEL_F4

 	subs	I, I, #1
 	bne	.Lzsum_kernel_F4

 	/* Collapse the vector accumulator into SUMF before the scalar tail. */
 	KERNEL_F4_FINALIZE

 /* Contiguous remainder: N & 3 passes of KERNEL_F1. */
 .Lzsum_kernel_F1:

 	ands	I, N, #3
 	ble	.Lzsum_kernel_L999

 .Lzsum_kernel_F10:

 	KERNEL_F1

 	subs    I, I, #1
        bne     .Lzsum_kernel_F10

 .Lzsum_kernel_L999:
 	ret

 /* Strided path: I = N >> 2 passes of four KERNEL_S1 each. */
 .Lzsum_kernel_S_BEGIN:

 	INIT_S

 	asr	I, N, #2
 	cmp	I, xzr
 	ble	.Lzsum_kernel_S1

 .Lzsum_kernel_S4:

 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1
 	KERNEL_S1

 	subs	I, I, #1
 	bne	.Lzsum_kernel_S4

 /* Strided remainder: N & 3 passes of KERNEL_S1. */
 .Lzsum_kernel_S1:

 	ands	I, N, #3
 	ble	.Lzsum_kernel_L999

 .Lzsum_kernel_S10:

 	KERNEL_S1

 	subs    I, I, #1
        bne     .Lzsum_kernel_S10

 	ret

 	EPILOGUE
--- a/kernel/ia64/KERNEL
+++ b/kernel/ia64/KERNEL
@@ -60,6 +60,10 @@ CASUMKERNEL  = asum.S
 ZASUMKERNEL  = asum.S
 XASUMKERNEL  = asum.S

 # ?SUM kernels: the C, Z and X variants all build from the same source, sum.S.
 CSUMKERNEL  = sum.S
 ZSUMKERNEL  = sum.S
 XSUMKERNEL  = sum.S

 CNRM2KERNEL  = nrm2.S
 ZNRM2KERNEL  = nrm2.S
 XNRM2KERNEL  = nrm2.S
--- a/kernel/ia64/sum.S
+++ b/kernel/ia64/sum.S
@@ -0,0 +1,358 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* Copyright 2019, The OpenBLAS project                              */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

 #define ASSEMBLER
 #include "common.h"

 #ifdef XDOUBLE
 #define PREFETCH_SIZE ( 8 * 16 +  4)
 #elif defined(DOUBLE)
 #define PREFETCH_SIZE (16 * 16 +  8)
 #else
 #define PREFETCH_SIZE (32 * 16 + 16)
 #endif

 #ifndef COMPLEX
 #define COMPADD	0
 #define STRIDE INCX
 #else
 #define COMPADD	1
 #define STRIDE SIZE
 #endif

 #define PRE1	r2

 #define I	r17
 #define J	r18
 #define INCX16	r21

 #define PR	r30
 #define ARLC	r31

 #define N	r32
 #define X	r33
 #define INCX	r34


 /*
  * Sum of the values of a strided vector (no absolute value is taken).
  *
  * Arguments arrive in N (r32), X (r33) and INCX (r34); when F_INTERFACE is
  * defined, N and INCX are pointers and are dereferenced first.  The result
  * is left in f8.
  *
  * COMPADD/STRIDE select the real or COMPLEX variant.  In the COMPLEX build
  * each element is read as two values SIZE bytes apart and both are added.
  *
  * Four partial sums are kept in f8..f11 and combined at .L998.
  * Returns early, with f8 = f0, when INCX <= 0 or N <= 0.
  */
 	PROLOGUE
 	.prologue
 	PROFCODE
 	/* PRE1 = prefetch pointer ahead of X; f8 = f0 clears the first
 	 * accumulator; ar.lc is saved so it can be restored at .L998. */
 	{ .mfi
 	adds	PRE1 = PREFETCH_SIZE * SIZE, X
 	mov	f8   = f0
 	.save ar.lc, ARLC
 	mov	ARLC = ar.lc
 	}
 	;;
 	.body
 #ifdef F_INTERFACE
 	/* Fortran-style call: N and INCX are passed by reference. */
 	{ .mmi
 	LDINT	N    = [N]
 	LDINT	INCX = [INCX]
 	nop.i	0
 	}
 	;;
 #ifndef USE64BITINT
 	/* 32-bit integers: sign-extend to full register width. */
 	{ .mii
 	nop.m	0
 	sxt4	N = N
 	sxt4	INCX = INCX
 	}
 	;;
 #endif
 #endif
 	/* p6 = !(0 < INCX), p7 = !(0 < N): either one triggers the early
 	 * return in the next bundle.
 	 * I = number of full passes of the unrolled loop (16 values each),
 	 * J = number of elements left over after those passes. */
 	{ .mmi
 	cmp.lt	p0, p6 = r0, INCX
 	cmp.lt	p0, p7 = r0, N
 	shr	I =  N, (4 - COMPADD)
 	}
 	{ .mbb
 	and	J = ((1 << (4 - COMPADD)) - 1), N
 	(p6) 	br.ret.sptk.many b0
 	(p7) 	br.ret.sptk.many b0
 	}
 	;;
 	/* I - 1 is what gets written to ar.lc below; f10 is cleared and the
 	 * caller's predicates are saved in PR. */
 	{ .mfi
 	adds	I = -1, I
 	mov	f10 = f0
 	mov	PR = pr
 	}
 	/* p9 = (J == 0): no remainder.  p12 = the "half a pass" bit of N. */
 	{ .mfi
 	cmp.eq	p9, p0  =   r0, J
 	mov	f9  = f0
 	tbit.z	p0, p12 = N, 3 - COMPADD
 	}
 	;;
 	/* Stage predicates for the pipelined loop: p16 true, p17..p19 false
 	 * (p19 is cleared two bundles further down); epilogue count = 3. */
 	{ .mmi
 	cmp.eq	p16, p0 = r0, r0
 	cmp.ne	p17, p0 = r0, r0
 	mov	ar.ec= 3
 	}
 	/* INCX is converted from elements to bytes. */
 	{ .mfi
 	cmp.ne	p18, p0 = r0, r0
 	mov	f11 = f0
 	shl	INCX = INCX, BASE_SHIFT + COMPADD
 	}
 	;;
 	/* INCX16 = byte distance used to advance the prefetch pointer. */
 	{ .mmi
 #ifdef XDOUBLE
 	shladd	INCX16  = INCX, (3 - COMPADD), r0
 #else
 	shladd	INCX16  = INCX, (4 - COMPADD), r0
 #endif
 	cmp.ne	p19, p0 = r0, r0
 	mov	ar.lc = I
 	}
 	/* p8 = (0 > I): fewer elements than one full pass, go straight to
 	 * the remainder code.  In the COMPLEX build INCX is reduced by SIZE
 	 * because the first value of each element is stepped over with
 	 * STRIDE (= SIZE) and only the second load advances by INCX. */
 	{ .mmb
 	cmp.gt	p8 ,p0  =   r0, I
 #ifdef COMPLEX
        adds	INCX = - SIZE, INCX
 #else
 	nop.m	0
 #endif
 	(p8) br.cond.dpnt  .L55
 	}
 	;;
 	.align 32

 /*
  * Main loop: 16 loads per pass, closed by br.ctop.  Loads are issued under
  * p16 and the FADDs that consume them under p18/p19, using different
  * register numbers for the same data.
  * NOTE(review): this depends on IA-64 rotating-register renaming between
  * passes (a value loaded as f32 being read as f34 two passes later) -
  * confirm against the architecture manual before changing any register
  * number or the ar.ec value.
  */
 .L52:
 	{ .mmf
 	(p16) lfetch.nt1 [PRE1], INCX16
 	(p16) LDFD	f32  = [X], STRIDE
 	}
 	{ .mfb
 	(p19) FADD	f8  = f8,  f71
 	}
 	;;
 	{ .mmf
 	(p16) LDFD	f35  = [X], INCX
 	}
 	{ .mfb
 	(p19) FADD	f9  = f9,  f74
 	}
 	;;
 	{ .mmf
 	(p16) LDFD	f38  = [X], STRIDE
 	}
 	{ .mfb
 	(p19) FADD	f10 = f10, f77
 	}
 	;;
 	{ .mmf
 	(p16) LDFD	f41  = [X], INCX
 	}
 	{ .mfb
 	(p19) FADD	f11 = f11, f80
 	}
 	;;
 	{ .mmf
 	(p16) LDFD	f44  = [X], STRIDE
 	}
 	{ .mfb
 	(p18) FADD	f8  = f8,  f34
 	}
 	;;
 	{ .mmf
 	(p16) LDFD	f47  = [X], INCX
 	}
 	{ .mfb
 	(p18) FADD	f9  = f9,  f37
 	}
 	;;
 	{ .mmf
 	(p16) LDFD	f50  = [X], STRIDE
 	}
 	{ .mfb
 	(p18) FADD	f10 = f10, f40
 	}
 	;;
 	{ .mmf
 	(p16) LDFD	f53  = [X], INCX
 	}
 	{ .mfb
 	(p18) FADD	f11 = f11, f43
 	}
 	;;
 	/* XDOUBLE issues a second prefetch half-way through the pass. */
 	{ .mmf
 #ifdef XDOUBLE
 	(p16) lfetch.nt1 [PRE1], INCX16
 #endif
 	(p16) LDFD	f56  = [X], STRIDE
 	}
 	{ .mfb
 	(p18) FADD	f8  = f8,  f46
 	}
 	;;
 	{ .mmf
 	(p16) LDFD	f59  = [X], INCX
 	}
 	{ .mfb
 	(p18) FADD	f9  = f9,  f49
 	}
 	;;
 	{ .mmf
 	(p16) LDFD	f62  = [X], STRIDE
 	}
 	{ .mfb
 	(p18) FADD	f10 = f10, f52
 	}
 	;;
 	{ .mmf
 	(p16) LDFD	f65  = [X], INCX
 	}
 	{ .mfb
 	(p18) FADD	f11 = f11, f55
 	}
 	;;
 	{ .mmf
 	(p16) LDFD	f68  = [X], STRIDE
 	}
 	{ .mfb
 	(p18) FADD	f8  = f8,  f58
 	}
 	;;
 	{ .mmf
 	(p16) LDFD	f71  = [X], INCX
 	}
 	{ .mfb
 	(p18) FADD	f9  = f9,  f61
 	}
 	;;
 	{ .mmf
 	(p16) LDFD	f74  = [X], STRIDE
 	}
 	{ .mfb
 	(p18) FADD	f10 = f10, f64
 	}
 	;;
 	{ .mmf
 	(p16) LDFD	f77  = [X], INCX
 	}
 	{ .mfb
 	(p18) FADD	f11 = f11, f67
 	br.ctop.sptk.few .L52
 	}
 	;;
 	/* Unpredicated adds after the loop exits.
 	 * NOTE(review): these look like the p19-stage adds of the final pass,
 	 * which the loop itself does not reach - confirm. */
 	FADD	f8  = f8,  f71
 	FADD	f9  = f9,  f74
 	FADD	f10 = f10, f77
 	FADD	f11 = f11, f80
 	.align 32
 	;;
 /*
  * Remainder: the low bits of N select groups of 8, 4, 2 and (real build
  * only) 1 further values, guarded by p12, p13, p14 and p15 respectively.
  * Loads are issued first and the matching adds follow.
  */
 .L55:
 	(p12) LDFD	f32  = [X], STRIDE
 	/* J == 0: nothing left, go combine the partial sums. */
 	(p9) br.cond.dptk .L998
 	;;
 	(p12) LDFD	f33  = [X], INCX
 	;;
 	(p12) LDFD	f34  = [X], STRIDE
 	;;
 	(p12) LDFD	f35  = [X], INCX
 	tbit.z	p0, p13 = N, (2 - COMPADD)
 	;;
 	(p12) LDFD	f36  = [X], STRIDE
 	tbit.z	p0, p14 = N, (1 - COMPADD)
 	;;
 	(p12) LDFD	f37  = [X], INCX
 #ifndef COMPLEX
 	tbit.z	p0, p15 = N, 0
 #endif
 	;;
 	(p12) LDFD	f38  = [X], STRIDE
 	;;
 	(p12) LDFD	f39  = [X], INCX
 	;;
 	(p13) LDFD	f40  = [X], STRIDE
 	;;
 	(p13) LDFD	f41  = [X], INCX
 	;;
 	(p13) LDFD	f42  = [X], STRIDE
 	(p12) FADD	f8  = f8,  f32
 	;;
 	(p13) LDFD	f43  = [X], INCX
 	(p12) FADD	f9  = f9,  f33
 	;;
 	(p14) LDFD	f44  = [X], STRIDE
 	(p12) FADD	f10 = f10, f34
 	;;
 	(p14) LDFD	f45  = [X], INCX
 	(p12) FADD	f11 = f11, f35
 	;;
 #ifndef COMPLEX
 	(p15) LDFD	f46  = [X]
 #endif
 	(p12) FADD	f8  = f8,  f36
 	;;
 	(p12) FADD	f9  = f9,  f37
 	(p12) FADD	f10 = f10, f38
 	(p12) FADD	f11 = f11, f39
 	;;
 	(p13) FADD	f8  = f8,  f40
 	(p13) FADD	f9  = f9,  f41
 #ifndef COMPLEX
 #endif
 	(p13) FADD	f10 = f10, f42
 	;;
 	(p13) FADD	f11 = f11, f43
 	(p14) FADD	f8  = f8,  f44
 	(p14) FADD	f9  = f9,  f45
 #ifndef COMPLEX
 	(p15) FADD	f10 = f10, f46
 #endif
 	;;
 	.align 32

 /*
  * Exit: f8 = (f8 + f9) + (f10 + f11).  ar.lc and the saved predicates are
  * restored before returning through b0.
  */
 .L998:
 	{ .mfi
 	FADD	f8  = f8,  f9
 	mov	ar.lc  = ARLC
 	}
 	{ .mmf
 	FADD	f10 = f10, f11
 	}
 	;;
 	/* Restore the predicate registers selected by the mask from PR. */
 	{ .mii
 	mov	pr = PR, -65474
 	}
 	;;
 	{ .mfb
 	FADD	f8  = f8,  f10
 	br.ret.sptk.many b0
 	}
 	EPILOGUE
--- a/kernel/mips/KERNEL.P5600
+++ b/kernel/mips/KERNEL.P5600
@@ -30,6 +30,11 @@ IDMAXKERNEL  = ../mips/imax.c
 ISMINKERNEL  = ../mips/imin.c
 IDMINKERNEL  = ../mips/imin.c

 SSUMKERNEL  = ../mips/sum.c
 DSUMKERNEL  = ../mips/sum.c
 CSUMKERNEL  = ../mips/zsum.c
 ZSUMKERNEL  = ../mips/zsum.c

 ifdef HAVE_MSA
 SASUMKERNEL  = ../mips/sasum_msa.c
 DASUMKERNEL  = ../mips/dasum_msa.c
--- a/kernel/mips/sum.c
+++ b/kernel/mips/sum.c
@@ -0,0 +1,47 @@
 /***************************************************************************
 Copyright (c) 2016, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

 #include "common.h"
 #include <math.h>


 /*
  * Plain (signed) sum of a strided real vector.
  *
  *   n      number of elements to add
  *   x      pointer to the first element
  *   inc_x  distance between consecutive elements, counted in FLOATs
  *
  * Returns x[0] + x[inc_x] + ... + x[(n-1)*inc_x], accumulated left to
  * right.  A non-positive n or inc_x yields 0.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	FLOAT total = 0.0;
 	BLASLONG pos;
 	BLASLONG limit;

 	if (n <= 0 || inc_x <= 0)
 		return total;

 	/* One past the index of the last element to visit. */
 	limit = n * inc_x;

 	for (pos = 0; pos < limit; pos += inc_x)
 		total += x[pos];

 	return total;
 }


--- a/kernel/mips/zsum.c
+++ b/kernel/mips/zsum.c
@@ -0,0 +1,52 @@
 /***************************************************************************
 Copyright (c) 2016, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

 #include "common.h"
 #include <math.h>

 /*
  * Sum of the real and imaginary parts of the complex value whose real part
  * is stored at index i of x.  Arguments and the whole expansion are
  * parenthesized so the macro stays correct inside larger expressions.
  */
 #define CSUM1(x,i)	((x)[(i)] + (x)[(i) + 1])

 /*
  * Plain (signed) sum of all real and imaginary parts of a strided complex
  * vector stored as interleaved (re, im) pairs.
  *
  *   n      number of complex elements to add
  *   x      pointer to the real part of the first element
  *   inc_x  distance between consecutive elements, counted in complex values
  *
  * Returns the accumulated total; a non-positive n or inc_x yields 0.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0;
 	FLOAT sumf = 0.0;
 	BLASLONG inc_x2;

 	if (n <= 0 || inc_x <= 0) return(sumf);

 	/* Each complex element occupies two FLOATs. */
 	inc_x2 = 2 * inc_x;

 	/* n becomes the exclusive upper bound on the FLOAT index. */
 	n *= inc_x2;
 	while(i < n)
 	{
 		sumf += CSUM1(x,i);
 		i += inc_x2;
 	}
 	return(sumf);
 }


--- a/kernel/mips64/sum.S
+++ b/kernel/mips64/sum.S
@@ -0,0 +1,332 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

 #define ASSEMBLER
 #include "common.h"

 #define N	$4
 #define	X	$5
 #define INCX	$6

 #define I	$2
 #define TEMP	$3

 #define a1	$f2
 #define a2	$f3
 #define a3	$f4
 #define a4	$f5
 #define a5	$f6
 #define a6	$f7
 #define a7	$f8
 #define a8	$f9

 #define t1	$f10
 #define t2	$f11
 #define t3	$f12
 #define t4	$f13

 #define s1	$f0
 #define s2	$f1

 	PROLOGUE

 #ifdef F_INTERFACE
 	LDINT	N,     0(N)
 	LDINT	INCX,  0(INCX)
 #endif

 	MTC	$0,  s1

 	MTC	$0,  s2
 	dsll	INCX, INCX, BASE_SHIFT

 	blez	N, .L999
 	li	TEMP, SIZE

 	bne	INCX, TEMP, .L20
 	dsra	I, N, 3

 	blez	I, .L15
 	NOP

 	LD	a1,  0 * SIZE(X)
 	LD	a2,  1 * SIZE(X)
 	LD	a3,  2 * SIZE(X)
 	LD	a4,  3 * SIZE(X)

 	LD	a5,  4 * SIZE(X)
 	MOV	t1, a1
 	LD	a6,  5 * SIZE(X)
 	MOV	t2, a2
 	LD	a7,  6 * SIZE(X)
 	MOV	t3, a3

 	MOV	t4, a4
 	daddiu	I, I, -1

 	blez	I, .L13
 	LD	a8,  7 * SIZE(X)
 	.align 3

 .L12:
 	ADD	s1, s1, t1
 	LD	a1,  8 * SIZE(X)

 	MOV	t1, a5
 	daddiu	I, I, -1

 	ADD	s2, s2, t2
 	LD	a2,  9 * SIZE(X)

 	MOV	t2, a6
 	NOP

 	ADD	s1, s1, t3
 	LD	a3, 10 * SIZE(X)

 	MOV	t3, a7
 	NOP

 	ADD	s2, s2, t4
 	LD	a4, 11 * SIZE(X)

 	MOV	t4, a8
 	daddiu	X, X, 8 * SIZE

 	ADD	s1, s1, t1
 	LD	a5,  4 * SIZE(X)

 	MOV	t1, a1
 	NOP

 	ADD	s2, s2, t2
 	LD	a6,  5 * SIZE(X)

 	MOV	t2, a2
 	NOP

 	ADD	s1, s1, t3
 	LD	a7,  6 * SIZE(X)

 	MOV	t3, a3
 	NOP

 	ADD	s2, s2, t4
 	LD	a8,  7 * SIZE(X)

 	bgtz	I, .L12
 	MOV	t4, a4
 	.align 3

 .L13:
 	ADD	s1, s1, t1
 	daddiu	X, X, 8 * SIZE

 	MOV	t1, a5
 	NOP

 	ADD	s2, s2, t2
 	MOV	t2, a6

 	ADD	s1, s1, t3
 	MOV	t3, a7

 	ADD	s2, s2, t4
 	MOV	t4, a8

 	ADD	s1, s1, t1
 	ADD	s2, s2, t2
 	ADD	s1, s1, t3
 	ADD	s2, s2, t4
 	.align 3

 .L15:
 	andi	I,  N, 7

 	blez	I, .L999
 	NOP
 	.align	3

 .L16:
 	LD	a1,  0 * SIZE(X)
 	daddiu	I, I, -1

 	MOV	t1, a1

 	ADD	s1, s1, t1

 	bgtz	I, .L16
 	daddiu	X, X, SIZE

 	j	.L999
 	NOP
 	.align 3

 .L20:
 	blez	I, .L25
 	NOP

 	LD	a1,  0 * SIZE(X)
 	daddu	X, X, INCX

 	LD	a2,  0 * SIZE(X)
 	daddu	X, X, INCX

 	LD	a3,  0 * SIZE(X)
 	daddu	X, X, INCX

 	LD	a4,  0 * SIZE(X)
 	daddu	X, X, INCX

 	LD	a5,  0 * SIZE(X)
 	daddu	X, X, INCX

 	LD	a6,  0 * SIZE(X)
 	daddu	X, X, INCX

 	MOV	t1, a1
 	LD	a7,  0 * SIZE(X)

 	MOV	t2, a2
 	daddu	X, X, INCX

 	MOV	t3, a3
 	LD	a8,  0 * SIZE(X)

 	MOV	t4, a4
 	daddiu	I, I, -1

 	blez	I, .L24
 	daddu	X, X, INCX
 	.align 3

 .L23:
 	ADD	s1, s1, t1
 	LD	a1,  0 * SIZE(X)

 	MOV	t1, a5
 	daddu	X, X, INCX

 	ADD	s2, s2, t2
 	LD	a2,  0 * SIZE(X)

 	MOV	t2, a6
 	daddu	X, X, INCX

 	ADD	s1, s1, t3
 	LD	a3,  0 * SIZE(X)

 	MOV	t3, a7
 	daddu	X, X, INCX

 	ADD	s2, s2, t4
 	LD	a4,  0 * SIZE(X)

 	MOV	t4, a8
 	daddu	X, X, INCX

 	ADD	s1, s1, t1
 	LD	a5,  0 * SIZE(X)

 	MOV	t1, a1
 	daddu	X, X, INCX

 	ADD	s2, s2, t2
 	LD	a6,  0 * SIZE(X)

 	MOV	t2, a2
 	daddu	X, X, INCX

 	ADD	s1, s1, t3
 	LD	a7,  0 * SIZE(X)

 	MOV	t3, a3
 	daddu	X, X, INCX

 	ADD	s2, s2, t4
 	LD	a8,  0 * SIZE(X)

 	MOV	t4, a4
 	daddiu	I, I, -1

 	bgtz	I, .L23
 	daddu	X, X, INCX
 	.align 3

 .L24:
 	ADD	s1, s1, t1
 	MOV	t1, a5

 	ADD	s2, s2, t2
 	MOV	t2, a6

 	ADD	s1, s1, t3
 	MOV	t3, a7

 	ADD	s2, s2, t4
 	MOV	t4, a8

 	ADD	s1, s1, t1
 	ADD	s2, s2, t2
 	ADD	s1, s1, t3
 	ADD	s2, s2, t4
 	.align 3

 .L25:
 	andi	I,  N, 7

 	blez	I, .L999
 	NOP
 	.align	3

 .L26:
 	LD	a1,  0 * SIZE(X)
 	daddiu	I, I, -1

 	MOV	t1, a1
 	daddu	X, X, INCX

 	bgtz	I, .L26
 	ADD	s1, s1, t1
 	.align 3

 .L999:
 	j	$31
 	ADD	s1, s1, s2

 	EPILOGUE
--- a/kernel/mips64/zsum.S
+++ b/kernel/mips64/zsum.S
@@ -0,0 +1,204 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

 #define ASSEMBLER
 #include "common.h"

 #define N	$4
 #define	X	$5
 #define INCX	$6

 #define I	$2
 #define TEMP	$3

 #define a1	$f2
 #define a2	$f3
 #define a3	$f4
 #define a4	$f5
 #define a5	$f6
 #define a6	$f7
 #define a7	$f8
 #define a8	$f9

 #define t1	$f10
 #define t2	$f11
 #define t3	$f12
 #define t4	$f13

 #define s1	$f0
 #define s2	$f1

 	PROLOGUE

 #ifdef F_INTERFACE
 	LDINT	N,     0(N)
 	LDINT	INCX,  0(INCX)
 #endif

 	MTC	$0,  s1

 	MTC	$0,  s2
 	dsll	INCX, INCX, ZBASE_SHIFT

 	blez	N, .L999
 	dsra	I, N, 2

 	blez	I, .L25
 	NOP

 	LD	a1,  0 * SIZE(X)
 	LD	a2,  1 * SIZE(X)
 	daddu	X, X, INCX

 	LD	a3,  0 * SIZE(X)
 	LD	a4,  1 * SIZE(X)
 	daddu	X, X, INCX

 	LD	a5,  0 * SIZE(X)
 	LD	a6,  1 * SIZE(X)
 	daddu	X, X, INCX

 	MOV	t1, a1
 	MOV	t2, a2

 	LD	a7,  0 * SIZE(X)
 	LD	a8,  1 * SIZE(X)

 	MOV	t3, a3
 	MOV	t4, a4
 	daddiu	I, I, -1

 	blez	I, .L24
 	daddu	X, X, INCX
 	.align 3

 .L23:
 	ADD	s1, s1, t1
 	LD	a1,  0 * SIZE(X)

 	MOV	t1, a5
 	daddiu	I, I, -1

 	ADD	s2, s2, t2
 	LD	a2,  1 * SIZE(X)

 	MOV	t2, a6
 	daddu	X, X, INCX

 	ADD	s1, s1, t3
 	LD	a3,  0 * SIZE(X)

 	MOV	t3, a7
 	NOP

 	ADD	s2, s2, t4
 	LD	a4,  1 * SIZE(X)

 	MOV	t4, a8
 	daddu	X, X, INCX

 	ADD	s1, s1, t1
 	LD	a5,  0 * SIZE(X)

 	MOV	t1, a1
 	NOP

 	ADD	s2, s2, t2
 	LD	a6,  1 * SIZE(X)

 	MOV	t2, a2
 	daddu	X, X, INCX

 	ADD	s1, s1, t3
 	LD	a7,  0 * SIZE(X)

 	MOV	t3, a3
 	LD	a8,  1 * SIZE(X)

 	ADD	s2, s2, t4
 	daddu	X, X, INCX

 	bgtz	I, .L23
 	MOV	t4, a4
 	.align 3

 .L24:
 	ADD	s1, s1, t1
 	MOV	t1, a5

 	ADD	s2, s2, t2
 	MOV	t2, a6

 	ADD	s1, s1, t3
 	MOV	t3, a7

 	ADD	s2, s2, t4
 	MOV	t4, a8

 	ADD	s1, s1, t1
 	ADD	s2, s2, t2
 	ADD	s1, s1, t3
 	ADD	s2, s2, t4
 	.align 3

 .L25:
 	andi	I,  N, 3

 	blez	I, .L999
 	NOP
 	.align	3

 .L26:
 	LD	a1,  0 * SIZE(X)
 	LD	a2,  1 * SIZE(X)

 	MOV	t1, a1
 	daddiu	I, I, -1
 	MOV	t2, a2
 	daddu	X, X, INCX

 	ADD	s1, s1, t1
 	bgtz	I, .L26
 	ADD	s2, s2, t2
 	.align 3

 .L999:
 	j	$31
 	ADD	s1, s1, s2

 	EPILOGUE
--- a/kernel/power/sum.S
+++ b/kernel/power/sum.S
@@ -0,0 +1,446 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

 #define ASSEMBLER
 #include "common.h"

 #define N	r3
 #define X	r4
 #define INCX	r5

 #define PREA	r8

 #define FZERO	f0

 #define STACKSIZE 160

 /*
  * Sum of the elements of a strided real vector (no absolute value).
  *
  * In:   N (r3) element count, X (r4) data pointer, INCX (r5) stride in
  *       elements.  With F_INTERFACE, N and INCX are pointers and are
  *       loaded through first.
  * Out:  the total is written to f1 at LL(999).
  *
  * Eight partial sums are kept in f0..f7 and reduced pairwise at LL(999).
  * The unrolled loops consume 16 elements per pass, loading the next 16
  * while adding the current ones.  N <= 0 or INCX <= 0 skips straight to
  * LL(999) with every partial sum still zero.
  *
  * f14..f31 are saved to the local stack area on entry and reloaded before
  * returning.
  */
 	PROLOGUE
 	PROFCODE

 	/* Reserve STACKSIZE bytes of scratch space below the incoming SP. */
 	addi	SP, SP, -STACKSIZE
 	li	r0,   0

 	stfd	f14,    0(SP)
 	stfd	f15,    8(SP)
 	stfd	f16,   16(SP)
 	stfd	f17,   24(SP)

 	stfd	f18,   32(SP)
 	stfd	f19,   40(SP)
 	stfd	f20,   48(SP)
 	stfd	f21,   56(SP)

 	stfd	f22,   64(SP)
 	stfd	f23,   72(SP)
 	stfd	f24,   80(SP)
 	stfd	f25,   88(SP)

 	stfd	f26,   96(SP)
 	stfd	f27,  104(SP)
 	stfd	f28,  112(SP)
 	stfd	f29,  120(SP)

 	stfd	f30,  128(SP)
 	stfd	f31,  136(SP)

 	/* Materialise 0.0 in FZERO (f0) by storing an integer zero word and
 	 * reloading it as a single-precision float. */
 	stw	r0,   144(SP)
 	lfs	FZERO,144(SP)

 #ifdef F_INTERFACE
 	LDINT	N,    0(N)
 	LDINT	INCX, 0(INCX)
 #endif

 	/* Convert the stride from elements to bytes. */
 	slwi	INCX, INCX, BASE_SHIFT

 	/* Clear the remaining seven accumulators. */
 	fmr	f1,  FZERO
 	fmr	f2,  FZERO
 	fmr	f3,  FZERO
 	fmr	f4,  FZERO
 	fmr	f5,  FZERO
 	fmr	f6,  FZERO
 	fmr	f7,  FZERO

 	li	PREA, L1_PREFETCHSIZE

 	cmpwi	cr0, N, 0
 	ble-	LL(999)

 	cmpwi	cr0, INCX, 0
 	ble-	LL(999)

 	/* Anything other than a contiguous vector is handled at LL(100). */
 	cmpwi	cr0, INCX, SIZE
 	bne-	cr0, LL(100)

 	/* CTR = N / 16 full passes; none at all goes to the tail at LL(50). */
 	srawi.	r0, N, 4
 	mtspr	CTR, r0
 	beq-	cr0, LL(50)
 	.align 4

 	/* Preload the first 16 elements. */
 	LFD	f8,    0 * SIZE(X)
 	LFD	f9,    1 * SIZE(X)
 	LFD	f10,   2 * SIZE(X)
 	LFD	f11,   3 * SIZE(X)
 	LFD	f12,   4 * SIZE(X)
 	LFD	f13,   5 * SIZE(X)
 	LFD	f14,   6 * SIZE(X)
 	LFD	f15,   7 * SIZE(X)

 	LFD	f24,   8 * SIZE(X)
 	LFD	f25,   9 * SIZE(X)
 	LFD	f26,  10 * SIZE(X)
 	LFD	f27,  11 * SIZE(X)
 	LFD	f28,  12 * SIZE(X)
 	LFD	f29,  13 * SIZE(X)
 	LFD	f30,  14 * SIZE(X)
 	LFD	f31,  15 * SIZE(X)

 	fmr	f16, f8
 	fmr	f17, f9
 	fmr	f18, f10
 	fmr	f19, f11

 	fmr	f20, f12
 	fmr	f21, f13
 	fmr	f22, f14
 	fmr	f23, f15
 	/* Only one pass requested: skip the steady-state loop. */
 	bdz	LL(20)
 	.align 4

 /* Steady state: add the 16 values in flight while loading the next 16. */
 LL(10):
 	FADD	f0, f0, f16
 	fmr	f16, f24
 	FADD	f1, f1, f17
 	fmr	f17, f25

 	FADD	f2, f2, f18
 	fmr	f18, f26
 	FADD	f3, f3, f19
 	fmr	f19, f27

 	LFD	f8,   16 * SIZE(X)
 	LFD	f9,   17 * SIZE(X)
 	LFD	f10,  18 * SIZE(X)
 	LFD	f11,  19 * SIZE(X)

 	FADD	f4, f4, f20
 	fmr	f20, f28
 	FADD	f5, f5, f21
 	fmr	f21, f29

 	FADD	f6, f6, f22
 	fmr	f22, f30
 	FADD	f7, f7, f23
 	fmr	f23, f31

 	LFD	f12,  20 * SIZE(X)
 	LFD	f13,  21 * SIZE(X)
 	LFD	f14,  22 * SIZE(X)
 	LFD	f15,  23 * SIZE(X)

 	FADD	f0, f0, f16
 	fmr	f16, f8
 	FADD	f1, f1, f17
 	fmr	f17, f9

 	FADD	f2, f2, f18
 	fmr	f18, f10
 	FADD	f3, f3, f19
 	fmr	f19, f11

 	LFD	f24,  24 * SIZE(X)
 	LFD	f25,  25 * SIZE(X)
 	LFD	f26,  26 * SIZE(X)
 	LFD	f27,  27 * SIZE(X)

 	FADD	f4, f4, f20
 	fmr	f20, f12
 	FADD	f5, f5, f21
 	fmr	f21, f13

 	FADD	f6, f6, f22
 	fmr	f22, f14
 	FADD	f7, f7, f23
 	fmr	f23, f15

 	LFD	f28,  28 * SIZE(X)
 	LFD	f29,  29 * SIZE(X)
 	LFD	f30,  30 * SIZE(X)
 	LFD	f31,  31 * SIZE(X)

 	/* The prefetch is issued before or after X advances depending on
 	 * whether POWER6 is defined. */
 #ifndef POWER6
 	L1_PREFETCH	X, PREA
 #endif
 	addi	X, X, 16 * SIZE
 #ifdef POWER6
 	L1_PREFETCH	X, PREA
 #endif

 	bdnz	LL(10)
 	.align 4

 /* Drain: add the last 16 loaded values without loading any more. */
 LL(20):
 	FADD	f0, f0, f16
 	fmr	f16, f24
 	FADD	f1, f1, f17
 	fmr	f17, f25

 	FADD	f2, f2, f18
 	fmr	f18, f26
 	FADD	f3, f3, f19
 	fmr	f19, f27

 	FADD	f4, f4, f20
 	fmr	f20, f28
 	FADD	f5, f5, f21
 	fmr	f21, f29

 	FADD	f6, f6, f22
 	fmr	f22, f30
 	FADD	f7, f7, f23
 	fmr	f23, f31

 	FADD	f0, f0, f16
 	FADD	f1, f1, f17
 	FADD	f2, f2, f18
 	FADD	f3, f3, f19

 	FADD	f4, f4, f20
 	FADD	f5, f5, f21
 	FADD	f6, f6, f22
 	FADD	f7, f7, f23
 	addi	X, X, 16 * SIZE
 	.align 4

 /* Contiguous tail: N % 16 elements, one per pass, all into f0. */
 LL(50):
 	andi.	r0,  N, 15
 	mtspr	CTR, r0
 	beq	LL(999)
 	.align 4

 LL(60):
 	LFD	f8,  0 * SIZE(X)
 	addi	X, X,  1 * SIZE

 	FADD	f0, f0,  f8

 	bdnz	LL(60)
 	b	LL(999)
 	.align 4

 /*
  * Strided path.  X is first moved back by one stride because every load
  * below is an update-form LFDUX, which adds INCX to X before loading.
  */
 LL(100):
 	sub	X, X, INCX

 	srawi.	r0, N, 4
 	mtspr	CTR,  r0
 	beq-	LL(150)

 	LFDUX	f8,    X, INCX
 	LFDUX	f9,    X, INCX
 	LFDUX	f10,   X, INCX
 	LFDUX	f11,   X, INCX
 	LFDUX	f12,   X, INCX
 	LFDUX	f13,   X, INCX
 	LFDUX	f14,   X, INCX
 	LFDUX	f15,   X, INCX

 	LFDUX	f24,   X, INCX
 	LFDUX	f25,   X, INCX
 	LFDUX	f26,   X, INCX
 	LFDUX	f27,   X, INCX
 	LFDUX	f28,   X, INCX
 	LFDUX	f29,   X, INCX
 	LFDUX	f30,   X, INCX
 	LFDUX	f31,   X, INCX

 	fmr	f16, f8
 	fmr	f17, f9
 	fmr	f18, f10
 	fmr	f19, f11

 	fmr	f20, f12
 	fmr	f21, f13
 	fmr	f22, f14
 	fmr	f23, f15
 	bdz	LL(120)
 	.align 4

 /* Strided steady state: same schedule as LL(10). */
 LL(110):
 	FADD	f0, f0, f16
 	fmr	f16, f24
 	FADD	f1, f1, f17
 	fmr	f17, f25

 	FADD	f2, f2, f18
 	fmr	f18, f26
 	FADD	f3, f3, f19
 	fmr	f19, f27

 	LFDUX	f8,    X, INCX
 	LFDUX	f9,    X, INCX
 	LFDUX	f10,   X, INCX
 	LFDUX	f11,   X, INCX

 	FADD	f4, f4, f20
 	fmr	f20, f28
 	FADD	f5, f5, f21
 	fmr	f21, f29

 	FADD	f6, f6, f22
 	fmr	f22, f30
 	FADD	f7, f7, f23
 	fmr	f23, f31

 	LFDUX	f12,   X, INCX
 	LFDUX	f13,   X, INCX
 	LFDUX	f14,   X, INCX
 	LFDUX	f15,   X, INCX

 	FADD	f0, f0, f16
 	fmr	f16, f8
 	FADD	f1, f1, f17
 	fmr	f17, f9

 	FADD	f2, f2, f18
 	fmr	f18, f10
 	FADD	f3, f3, f19
 	fmr	f19, f11

 	LFDUX	f24,   X, INCX
 	LFDUX	f25,   X, INCX
 	LFDUX	f26,   X, INCX
 	LFDUX	f27,   X, INCX

 	FADD	f4, f4, f20
 	fmr	f20, f12
 	FADD	f5, f5, f21
 	fmr	f21, f13

 	FADD	f6, f6, f22
 	fmr	f22, f14
 	FADD	f7, f7, f23
 	fmr	f23, f15

 	LFDUX	f28,   X, INCX
 	LFDUX	f29,   X, INCX
 	LFDUX	f30,   X, INCX
 	LFDUX	f31,   X, INCX
 	bdnz	LL(110)
 	.align 4

 /* Strided drain. */
 LL(120):
 	FADD	f0, f0, f16
 	fmr	f16, f24
 	FADD	f1, f1, f17
 	fmr	f17, f25

 	FADD	f2, f2, f18
 	fmr	f18, f26
 	FADD	f3, f3, f19
 	fmr	f19, f27

 	FADD	f4, f4, f20
 	fmr	f20, f28
 	FADD	f5, f5, f21
 	fmr	f21, f29

 	FADD	f6, f6, f22
 	fmr	f22, f30
 	FADD	f7, f7, f23
 	fmr	f23, f31

 	FADD	f0, f0, f16
 	FADD	f1, f1, f17
 	FADD	f2, f2, f18
 	FADD	f3, f3, f19

 	FADD	f4, f4, f20
 	FADD	f5, f5, f21
 	FADD	f6, f6, f22
 	FADD	f7, f7, f23
 	.align 4

 /* Strided tail: N % 16 elements, one per pass, all into f0. */
 LL(150):
 	andi.	r0,  N, 15
 	mtspr	CTR, r0
 	beq	LL(999)
 	.align 4

 LL(160):
 	LFDUX	f8,    X, INCX
 	FADD	f0,  f0, f8
 	bdnz	LL(160)
 	.align 4

 /*
  * Common exit: reduce the eight partial sums as a tree and place the
  * total in f1, then reload f14..f31 and release the scratch area.
  */
 LL(999):
 	FADD	f0,  f0,  f1
 	FADD	f2,  f2,  f3
 	FADD	f4,  f4,  f5
 	FADD	f6,  f6,  f7

 	FADD	f0,  f0,  f2
 	FADD	f4,  f4,  f6
 	FADD	f1,  f0,  f4

 	lfd	f14,    0(SP)
 	lfd	f15,    8(SP)
 	lfd	f16,   16(SP)
 	lfd	f17,   24(SP)

 	lfd	f18,   32(SP)
 	lfd	f19,   40(SP)
 	lfd	f20,   48(SP)
 	lfd	f21,   56(SP)

 	lfd	f22,   64(SP)
 	lfd	f23,   72(SP)
 	lfd	f24,   80(SP)
 	lfd	f25,   88(SP)

 	lfd	f26,   96(SP)
 	lfd	f27,  104(SP)
 	lfd	f28,  112(SP)
 	lfd	f29,  120(SP)

 	lfd	f30,  128(SP)
 	lfd	f31,  136(SP)

 	addi	SP, SP, STACKSIZE
 	blr

 	EPILOGUE
--- a/kernel/power/zsum.S
+++ b/kernel/power/zsum.S
@@ -0,0 +1,452 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

 #define ASSEMBLER
 #include "common.h"

 /*
  * Complex "sum" kernel for PowerPC: adds up every scalar component of the
  * complex vector X (no absolute values are taken), using eight partial
  * accumulators f0-f7 that are combined at LL(999).
  *
  * Register roles:
  *   N      - number of complex elements
  *   X      - pointer to the first element
  *   INCX   - stride; shifted left by ZBASE_SHIFT below (presumably turning a
  *            count of complex elements into bytes - ZBASE_SHIFT and SIZE come
  *            from common.h)
  *   INCXM1 - scaled INCX minus SIZE, used only by the strided path
  *   PREA   - prefetch distance (L1_PREFETCHSIZE)
  *   FZERO  - alias of f0; holds 0.0 first and then serves as an accumulator
  */
 #define N	r3
 #define X	r4
 #define INCX	r5

 #define INCXM1	r9
 #define PREA	r8

 #define FZERO	f0

 #define STACKSIZE 160

 	PROLOGUE
 	PROFCODE

 	/* Reserve the frame; r0 = 0 is stored below to build a floating zero. */
 	addi	SP, SP, -STACKSIZE
 	li	r0,   0

 	/* Spill f14-f31: the loops below use them as load/staging registers
 	   and LL(999) reloads them before returning. */
 	stfd	f14,    0(SP)
 	stfd	f15,    8(SP)
 	stfd	f16,   16(SP)
 	stfd	f17,   24(SP)

 	stfd	f18,   32(SP)
 	stfd	f19,   40(SP)
 	stfd	f20,   48(SP)
 	stfd	f21,   56(SP)

 	stfd	f22,   64(SP)
 	stfd	f23,   72(SP)
 	stfd	f24,   80(SP)
 	stfd	f25,   88(SP)

 	stfd	f26,   96(SP)
 	stfd	f27,  104(SP)
 	stfd	f28,  112(SP)
 	stfd	f29,  120(SP)

 	stfd	f30,  128(SP)
 	stfd	f31,  136(SP)

 	/* FZERO = 0.0: store the integer zero and reload it as a float. */
 	stw	r0,   144(SP)
 	lfs	FZERO,144(SP)

 	/* With F_INTERFACE, N and INCX hold addresses and are dereferenced
 	   here (LDINT is presumably an integer-load macro from common.h). */
 #ifdef F_INTERFACE
 	LDINT	N,    0(N)
 	LDINT	INCX, 0(INCX)
 #endif

 	/* Scale the stride and derive the offset used by the strided path. */
 	slwi	INCX, INCX, ZBASE_SHIFT
 	subi	INCXM1, INCX, SIZE

 	/* Clear the remaining seven accumulators. */
 	fmr	f1,  FZERO
 	fmr	f2,  FZERO
 	fmr	f3,  FZERO
 	fmr	f4,  FZERO
 	fmr	f5,  FZERO
 	fmr	f6,  FZERO
 	fmr	f7,  FZERO

 	li	PREA, L1_PREFETCHSIZE

 	/* N <= 0 or stride <= 0: nothing to add, LL(999) returns the zero sum. */
 	cmpwi	cr0, N, 0
 	ble-	LL(999)

 	cmpwi	cr0, INCX, 0
 	ble-	LL(999)

 	/* Only a scaled stride of exactly 2 * SIZE takes the contiguous path;
 	   every other stride is handled at LL(100). */
 	cmpwi	cr0, INCX, 2 * SIZE
 	bne-	cr0, LL(100)

 	/* CTR = N / 8 blocks of eight complex elements; with no full block go
 	   straight to the remainder loop at LL(50). */
 	srawi.	r0, N, 3
 	mtspr	CTR, r0
 	beq-	cr0, LL(50)
 	.align 4

 	/*
 	 * Contiguous path, software pipelined.  Prime the pipeline with the
 	 * first block of 16 scalars: f8-f15 and f24-f31 receive the loads,
 	 * f16-f23 are the staging registers that feed the accumulators f0-f7.
 	 */
 	LFD	f8,    0 * SIZE(X)
 	LFD	f9,    1 * SIZE(X)
 	LFD	f10,   2 * SIZE(X)
 	LFD	f11,   3 * SIZE(X)
 	LFD	f12,   4 * SIZE(X)
 	LFD	f13,   5 * SIZE(X)
 	LFD	f14,   6 * SIZE(X)
 	LFD	f15,   7 * SIZE(X)

 	LFD	f24,   8 * SIZE(X)
 	LFD	f25,   9 * SIZE(X)
 	LFD	f26,  10 * SIZE(X)
 	LFD	f27,  11 * SIZE(X)
 	LFD	f28,  12 * SIZE(X)
 	LFD	f29,  13 * SIZE(X)
 	LFD	f30,  14 * SIZE(X)
 	LFD	f31,  15 * SIZE(X)

 	/* Stage the first eight values. */
 	fmr	f16, f8
 	fmr	f17, f9
 	fmr	f18, f10
 	fmr	f19, f11

 	fmr	f20, f12
 	fmr	f21, f13
 	fmr	f22, f14
 	fmr	f23, f15
 	/* Decrement CTR; if that was the only block, skip the steady-state
 	   loop and just drain at LL(20). */
 	bdz	LL(20)
 	.align 4

 /* Steady state: each pass accumulates the 16 scalars loaded by the previous
    pass (restaging f16-f23 after every add) while loading the next 16 from
    X + 16 * SIZE onward, then advances X by one block. */
 LL(10):
 	FADD	f0, f0, f16
 	fmr	f16, f24
 	FADD	f1, f1, f17
 	fmr	f17, f25

 	FADD	f2, f2, f18
 	fmr	f18, f26
 	FADD	f3, f3, f19
 	fmr	f19, f27

 	LFD	f8,   16 * SIZE(X)
 	LFD	f9,   17 * SIZE(X)
 	LFD	f10,  18 * SIZE(X)
 	LFD	f11,  19 * SIZE(X)

 	FADD	f4, f4, f20
 	fmr	f20, f28
 	FADD	f5, f5, f21
 	fmr	f21, f29

 	FADD	f6, f6, f22
 	fmr	f22, f30
 	FADD	f7, f7, f23
 	fmr	f23, f31

 	LFD	f12,  20 * SIZE(X)
 	LFD	f13,  21 * SIZE(X)
 	LFD	f14,  22 * SIZE(X)
 	LFD	f15,  23 * SIZE(X)

 	FADD	f0, f0, f16
 	fmr	f16, f8
 	FADD	f1, f1, f17
 	fmr	f17, f9

 	FADD	f2, f2, f18
 	fmr	f18, f10
 	FADD	f3, f3, f19
 	fmr	f19, f11

 	LFD	f24,  24 * SIZE(X)
 	LFD	f25,  25 * SIZE(X)
 	LFD	f26,  26 * SIZE(X)
 	LFD	f27,  27 * SIZE(X)

 	FADD	f4, f4, f20
 	fmr	f20, f12
 	FADD	f5, f5, f21
 	fmr	f21, f13

 	FADD	f6, f6, f22
 	fmr	f22, f14
 	FADD	f7, f7, f23
 	fmr	f23, f15

 	LFD	f28,  28 * SIZE(X)
 	LFD	f29,  29 * SIZE(X)
 	LFD	f30,  30 * SIZE(X)
 	LFD	f31,  31 * SIZE(X)

 	/* The prefetch is issued before the pointer bump except on POWER6,
 	   where it is issued after it. */
 #ifndef POWER6
 	L1_PREFETCH	X, PREA
 #endif
 	addi	X, X, 16 * SIZE
 #ifdef POWER6
 	L1_PREFETCH	X, PREA
 #endif

 	bdnz	LL(10)
 	.align 4

 /* Drain: accumulate the 16 scalars that are loaded but not yet added, then
    step X past that final block. */
 LL(20):
 	FADD	f0, f0, f16
 	fmr	f16, f24
 	FADD	f1, f1, f17
 	fmr	f17, f25

 	FADD	f2, f2, f18
 	fmr	f18, f26
 	FADD	f3, f3, f19
 	fmr	f19, f27

 	FADD	f4, f4, f20
 	fmr	f20, f28
 	FADD	f5, f5, f21
 	fmr	f21, f29

 	FADD	f6, f6, f22
 	fmr	f22, f30
 	FADD	f7, f7, f23
 	fmr	f23, f31

 	FADD	f0, f0, f16
 	FADD	f1, f1, f17
 	FADD	f2, f2, f18
 	FADD	f3, f3, f19

 	FADD	f4, f4, f20
 	FADD	f5, f5, f21
 	FADD	f6, f6, f22
 	FADD	f7, f7, f23
 	addi	X, X, 16 * SIZE
 	.align 4

 /* Remainder: CTR = N mod 8 complex elements; none left -> LL(999). */
 LL(50):
 	andi.	r0,  N, 7
 	mtspr	CTR, r0
 	beq	LL(999)
 	.align 4

 /* One complex element per pass: its two scalars go into f0 and f1. */
 LL(60):
 	LFD	f8,  0 * SIZE(X)
 	LFD	f9,  1 * SIZE(X)
 	addi	X, X,  2 * SIZE

 	FADD	f0, f0,  f8
 	FADD	f1, f1,  f9

 	bdnz	LL(60)
 	b	LL(999)
 	.align 4

 /*
  * Strided path.  X is biased by -INCXM1 (= SIZE - INCX) so that, for every
  * element, the LFDX at X + INCXM1 reads the first scalar and the following
  * LFDUX at X + INCX reads the second scalar and writes that address back to
  * X, which leaves X correctly biased for the next element.
  * (LFDX/LFDUX are macros from common.h, presumably the indexed load and the
  * indexed load-with-update.)
  */
 LL(100):
 	sub	X, X, INCXM1

 	/* CTR = N / 8 blocks; no full block -> remainder loop at LL(150). */
 	srawi.	r0, N, 3
 	mtspr	CTR,  r0
 	beq-	LL(150)

 	/* Prime the pipeline with the first eight complex elements. */
 	LFDX	f8,    X, INCXM1
 	LFDUX	f9,    X, INCX
 	LFDX	f10,   X, INCXM1
 	LFDUX	f11,   X, INCX
 	LFDX	f12,   X, INCXM1
 	LFDUX	f13,   X, INCX
 	LFDX	f14,   X, INCXM1
 	LFDUX	f15,   X, INCX

 	LFDX	f24,   X, INCXM1
 	LFDUX	f25,   X, INCX
 	LFDX	f26,   X, INCXM1
 	LFDUX	f27,   X, INCX
 	LFDX	f28,   X, INCXM1
 	LFDUX	f29,   X, INCX
 	LFDX	f30,   X, INCXM1
 	LFDUX	f31,   X, INCX

 	/* Stage the first eight scalars in f16-f23. */
 	fmr	f16, f8
 	fmr	f17, f9
 	fmr	f18, f10
 	fmr	f19, f11

 	fmr	f20, f12
 	fmr	f21, f13
 	fmr	f22, f14
 	fmr	f23, f15
 	/* Single block only: skip the steady-state loop. */
 	bdz	LL(120)
 	.align 4

 /* Steady state: accumulate the previously loaded 16 scalars into f0-f7 while
    loading the next eight complex elements. */
 LL(110):
 	FADD	f0, f0, f16
 	fmr	f16, f24
 	FADD	f1, f1, f17
 	fmr	f17, f25

 	FADD	f2, f2, f18
 	fmr	f18, f26
 	FADD	f3, f3, f19
 	fmr	f19, f27

 	LFDX	f8,    X, INCXM1
 	LFDUX	f9,    X, INCX
 	LFDX	f10,   X, INCXM1
 	LFDUX	f11,   X, INCX

 	FADD	f4, f4, f20
 	fmr	f20, f28
 	FADD	f5, f5, f21
 	fmr	f21, f29

 	FADD	f6, f6, f22
 	fmr	f22, f30
 	FADD	f7, f7, f23
 	fmr	f23, f31

 	LFDX	f12,   X, INCXM1
 	LFDUX	f13,   X, INCX
 	LFDX	f14,   X, INCXM1
 	LFDUX	f15,   X, INCX

 	FADD	f0, f0, f16
 	fmr	f16, f8
 	FADD	f1, f1, f17
 	fmr	f17, f9

 	FADD	f2, f2, f18
 	fmr	f18, f10
 	FADD	f3, f3, f19
 	fmr	f19, f11

 	LFDX	f24,   X, INCXM1
 	LFDUX	f25,   X, INCX
 	LFDX	f26,   X, INCXM1
 	LFDUX	f27,   X, INCX

 	FADD	f4, f4, f20
 	fmr	f20, f12
 	FADD	f5, f5, f21
 	fmr	f21, f13

 	FADD	f6, f6, f22
 	fmr	f22, f14
 	FADD	f7, f7, f23
 	fmr	f23, f15

 	LFDX	f28,   X, INCXM1
 	LFDUX	f29,   X, INCX
 	LFDX	f30,   X, INCXM1
 	LFDUX	f31,   X, INCX
 	bdnz	LL(110)
 	.align 4

 /* Drain: add the 16 scalars that are loaded but not yet accumulated. */
 LL(120):
 	FADD	f0, f0, f16
 	fmr	f16, f24
 	FADD	f1, f1, f17
 	fmr	f17, f25

 	FADD	f2, f2, f18
 	fmr	f18, f26
 	FADD	f3, f3, f19
 	fmr	f19, f27

 	FADD	f4, f4, f20
 	fmr	f20, f28
 	FADD	f5, f5, f21
 	fmr	f21, f29

 	FADD	f6, f6, f22
 	fmr	f22, f30
 	FADD	f7, f7, f23
 	fmr	f23, f31

 	FADD	f0, f0, f16
 	FADD	f1, f1, f17
 	FADD	f2, f2, f18
 	FADD	f3, f3, f19

 	FADD	f4, f4, f20
 	FADD	f5, f5, f21
 	FADD	f6, f6, f22
 	FADD	f7, f7, f23
 	.align 4

 /* Remainder: CTR = N mod 8 complex elements; none left -> LL(999). */
 LL(150):
 	andi.	r0,  N, 7
 	mtspr	CTR, r0
 	beq	LL(999)
 	.align 4

 /* One complex element per pass, accumulated into f0 and f1; execution then
    falls through into LL(999). */
 LL(160):
 	LFDX	f8,    X, INCXM1
 	LFDUX	f9,    X, INCX
 	FADD	f0,  f0, f8
 	FADD	f1,  f1, f9
 	bdnz	LL(160)
 	.align 4

 /* Common exit: reduce the eight partial sums pairwise.  The total is written
    to f1 (presumably the floating-point return register of the ABI in use). */
 LL(999):
 	FADD	f0,  f0,  f1
 	FADD	f2,  f2,  f3
 	FADD	f4,  f4,  f5
 	FADD	f6,  f6,  f7

 	FADD	f0,  f0,  f2
 	FADD	f4,  f4,  f6
 	FADD	f1,  f0,  f4

 	/* Reload f14-f31 from the slots they were spilled to on entry. */
 	lfd	f14,    0(SP)
 	lfd	f15,    8(SP)
 	lfd	f16,   16(SP)
 	lfd	f17,   24(SP)

 	lfd	f18,   32(SP)
 	lfd	f19,   40(SP)
 	lfd	f20,   48(SP)
 	lfd	f21,   56(SP)

 	lfd	f22,   64(SP)
 	lfd	f23,   72(SP)
 	lfd	f24,   80(SP)
 	lfd	f25,   88(SP)

 	lfd	f26,   96(SP)
 	lfd	f27,  104(SP)
 	lfd	f28,  112(SP)
 	lfd	f29,  120(SP)

 	lfd	f30,  128(SP)
 	lfd	f31,  136(SP)

 	/* Release the frame and return. */
 	addi	SP, SP, STACKSIZE
 	blr

 	EPILOGUE
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@@ -70,7 +70,7 @@ gotoblas_t TABLE_NAME = {

  samax_kTS,  samin_kTS,  smax_kTS,  smin_kTS,
  isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS,
  snrm2_kTS,  sasum_kTS,  scopy_kTS, sdot_kTS,
  snrm2_kTS,  sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS,
  dsdot_kTS,
  srot_kTS,   saxpy_kTS,  sscal_kTS, sswap_kTS,
  sgemv_nTS,  sgemv_tTS, sger_kTS,
@@ -126,7 +126,7 @@ gotoblas_t TABLE_NAME = {

  damax_kTS,  damin_kTS,  dmax_kTS,  dmin_kTS,
  idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS,
  dnrm2_kTS,  dasum_kTS,  dcopy_kTS, ddot_kTS,
  dnrm2_kTS,  dasum_kTS,  dsum_kTS, dcopy_kTS, ddot_kTS,
  drot_kTS,   daxpy_kTS,  dscal_kTS, dswap_kTS,
  dgemv_nTS,  dgemv_tTS,  dger_kTS,
  dsymv_LTS,  dsymv_UTS,
@@ -178,7 +178,7 @@ gotoblas_t TABLE_NAME = {

  qamax_kTS,  qamin_kTS,  qmax_kTS,  qmin_kTS,
  iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS,
  qnrm2_kTS,  qasum_kTS,  qcopy_kTS, qdot_kTS,
  qnrm2_kTS,  qasum_kTS,  qsum_kTS, qcopy_kTS, qdot_kTS,
  qrot_kTS,   qaxpy_kTS,  qscal_kTS, qswap_kTS,
  qgemv_nTS,  qgemv_tTS,  qger_kTS,
  qsymv_LTS,  qsymv_UTS,
@@ -234,7 +234,7 @@ gotoblas_t TABLE_NAME = {
 #endif

  camax_kTS, camin_kTS, icamax_kTS, icamin_kTS,
  cnrm2_kTS, casum_kTS, ccopy_kTS,
  cnrm2_kTS, casum_kTS, csum_kTS, ccopy_kTS,
  cdotu_kTS, cdotc_kTS, csrot_kTS,
  caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS,

@@ -369,7 +369,7 @@ gotoblas_t TABLE_NAME = {
 #endif

  zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS,
  znrm2_kTS, zasum_kTS, zcopy_kTS,
  znrm2_kTS, zasum_kTS, zsum_kTS, zcopy_kTS,
  zdotu_kTS, zdotc_kTS, zdrot_kTS,
  zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS,

@@ -500,7 +500,7 @@ gotoblas_t TABLE_NAME = {
  XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N),

  xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS,
  xnrm2_kTS, xasum_kTS, xcopy_kTS,
  xnrm2_kTS, xasum_kTS, xsum_kTS, xcopy_kTS,
  xdotu_kTS, xdotc_kTS, xqrot_kTS,
  xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS,

--- a/kernel/sparc/sum.S
+++ b/kernel/sparc/sum.S
@@ -0,0 +1,325 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

 #define ASSEMBLER
 #include "common.h"

 /*
  * Real "sum" kernel for SPARC: adds up the elements of X without taking
  * absolute values.
  *
  *   N    - element count
  *   X    - pointer to the first element
  *   INCX - stride; shifted left by BASE_SHIFT below (presumably elements to
  *          bytes - BASE_SHIFT and SIZE come from common.h)
  *   I    - loop counter
  *
  *   c1, c2  - the two running sums
  *   t1..t4  - staging registers: a loaded value waits here for one step
  *             before it is added
  *   a1..a8  - load targets
  *
  * The DOUBLE build uses only even-numbered FP registers, the single
  * precision build uses consecutive ones.
  */
 #define N	%i0
 #define X	%i1
 #define INCX	%i2
 #define I	%i3

 #ifdef DOUBLE
 #define c1	%f0
 #define c2	%f2
 #define t1	%f8
 #define t2	%f10
 #define t3	%f12
 #define t4	%f14

 #define a1	%f16
 #define a2	%f18
 #define a3	%f20
 #define a4	%f22
 #define a5	%f24
 #define a6	%f26
 #define a7	%f28
 #define a8	%f30
 #else
 #define c1	%f0
 #define c2	%f1
 #define t1	%f4
 #define t2	%f5
 #define t3	%f6
 #define t4	%f7

 #define a1	%f8
 #define a2	%f9
 #define a3	%f10
 #define a4	%f11
 #define a5	%f12
 #define a6	%f13
 #define a7	%f14
 #define a8	%f15
 #endif

 	PROLOGUE
 	SAVESP

 	/* FCLR is a common.h macro; presumably it zeroes FP register 0 (c1). */
 	FCLR(0)

 	sll	INCX, BASE_SHIFT, INCX

 	/* Copy c1 into the second sum and the four staging registers. */
 	FMOV	c1, c2
 	FMOV	c1, t1
 	FMOV	c1, t2
 	FMOV	c1, t3
 	FMOV	c1, t4

 	/* Stride <= 0 -> return via .LL19; scaled stride != SIZE -> strided path
 	   at .LL50.  No nop follows these two branches, so the instruction
 	   after each one occupies its delay slot: the second cmp also runs
 	   when .LL19 is taken and the sra below also runs when .LL50 is taken.
 	   Neither matters: .LL19 does not test the condition codes and .LL50
 	   recomputes I. */
 	cmp	INCX, 0
 	ble	.LL19
 	cmp	INCX, SIZE
 	bne	.LL50

 	/* I = N / 8 blocks of eight elements; no full block -> tail at .LL15. */
 	sra	N, 3, I
 	cmp	I, 0
 	ble,pn	%icc, .LL15
 	nop

 	/* Contiguous path: preload the first eight elements; the decrement and
 	   test of I are interleaved with the loads. */
 	LDF	[X +  0 * SIZE], a1
 	add	I, -1, I
 	LDF	[X +  1 * SIZE], a2
 	cmp	I, 0
 	LDF	[X +  2 * SIZE], a3
 	LDF	[X +  3 * SIZE], a4
 	LDF	[X +  4 * SIZE], a5
 	LDF	[X +  5 * SIZE], a6
 	LDF	[X +  6 * SIZE], a7
 	LDF	[X +  7 * SIZE], a8

 	/* Only one block: skip the loop.  The pointer bump is in the delay
 	   slot and therefore runs on both paths. */
 	ble,pt	%icc, .LL12
 	add	X, 8 * SIZE, X

 #define PREFETCHSIZE 128

 /* Steady state: each step adds the value staged in tN, restages tN from aN
    and reloads aN from the next block.  X is advanced before the final load,
    which sits in the branch delay slot and hence uses offset -1 * SIZE. */
 .LL11:
 	FADD	c1, t1, c1
 	prefetch [X  + PREFETCHSIZE * SIZE], 0
 	FMOV	a1, t1
 	LDF	[X +  0 * SIZE], a1

 	FADD	c2, t2, c2
 	add	I, -1, I
 	FMOV	a2, t2
 	LDF	[X +  1 * SIZE], a2

 	FADD	c1, t3, c1
 	cmp	I, 0
 	FMOV	a3, t3
 	LDF	[X +  2 * SIZE], a3

 	FADD	c2, t4, c2
 	nop
 	FMOV	a4, t4
 	LDF	[X +  3 * SIZE], a4

 	FADD	c1, t1, c1
 	nop
 	FMOV	a5, t1
 	LDF	[X +  4 * SIZE], a5

 	FADD	c2, t2, c2
 	nop
 	FMOV	a6, t2
 	LDF	[X +  5 * SIZE], a6

 	FADD	c1, t3, c1
 	FMOV	a7, t3
 	LDF	[X +  6 * SIZE], a7
 	add	X, 8 * SIZE, X

 	FADD	c2, t4, c2
 	FMOV	a8, t4
 	bg,pt	%icc, .LL11
 	LDF	[X -  1 * SIZE], a8

 /* Drain the last loaded block.  On exit a5..a8 are still waiting in t1..t4
    and get added later. */
 .LL12:
 	FADD	c1, t1, c1
 	FMOV	a1, t1
 	FADD	c2, t2, c2
 	FMOV	a2, t2

 	FADD	c1, t3, c1
 	FMOV	a3, t3
 	FADD	c2, t4, c2
 	FMOV	a4, t4

 	FADD	c1, t1, c1
 	FMOV	a5, t1
 	FADD	c2, t2, c2
 	FMOV	a6, t2

 	FADD	c1, t3, c1
 	FMOV	a7, t3
 	FADD	c2, t4, c2
 	FMOV	a8, t4

 /* Tail: I = N mod 8 single elements; none -> .LL19. */
 .LL15:
 	and	N, 7, I
 	cmp	I,  0
 	ble,a,pn %icc, .LL19
 	nop

 /* One element per pass: add the value staged in t1, then stage the new one.
    The pointer bump is in the branch delay slot. */
 .LL16:
 	LDF	[X +  0 * SIZE], a1
 	add	I, -1, I
 	cmp	I, 0
 	FADD	c1, t1, c1
 	FMOV	a1, t1
 	bg,pt	%icc, .LL16
 	add	X, 1 * SIZE, X

 /* Exit for the contiguous path and for stride <= 0: flush the four staged
    values, fold c2 into c1 and return with the total in c1. */
 .LL19:
 	FADD	c1, t1, c1
 	FADD	c2, t2, c2
 	FADD	c1, t3, c1
 	FADD	c2, t4, c2

 	FADD	c1, c2, c1
 	return	%i7 + 8
 	clr	%g0

 /* Strided path: same pipeline as the contiguous one, but every load reads
    [X] and is followed by X += INCX.  I = N / 8 blocks; none -> .LL55. */
 .LL50:
 	sra	N, 3, I
 	cmp	I, 0
 	ble,pn	%icc, .LL55
 	nop

 	/* Preload the first eight elements. */
 	LDF	[X +  0 * SIZE], a1
 	add	X, INCX, X
 	LDF	[X +  0 * SIZE], a2
 	add	X, INCX, X
 	LDF	[X +  0 * SIZE], a3
 	add	X, INCX, X
 	LDF	[X +  0 * SIZE], a4
 	add	X, INCX, X
 	LDF	[X +  0 * SIZE], a5
 	add	X, INCX, X
 	LDF	[X +  0 * SIZE], a6
 	add	X, INCX, X
 	add	I, -1, I
 	LDF	[X +  0 * SIZE], a7
 	cmp	I, 0
 	add	X, INCX, X
 	LDF	[X +  0 * SIZE], a8

 	/* Only one block: skip the loop (the pointer bump in the delay slot
 	   runs on both paths). */
 	ble,pt	%icc, .LL52
 	add	X, INCX, X

 /* Steady state: add the staged value, restage from aN, reload aN. */
 .LL51:
 	FADD	c1, t1, c1
 	add	I, -1, I
 	FMOV	a1, t1
 	LDF	[X +  0 * SIZE], a1
 	add	X, INCX, X

 	FADD	c2, t2, c2
 	cmp	I, 0
 	FMOV	a2, t2
 	LDF	[X +  0 * SIZE], a2
 	add	X, INCX, X

 	FADD	c1, t3, c1
 	FMOV	a3, t3
 	LDF	[X +  0 * SIZE], a3
 	add	X, INCX, X

 	FADD	c2, t4, c2
 	FMOV	a4, t4
 	LDF	[X +  0 * SIZE], a4
 	add	X, INCX, X

 	FADD	c1, t1, c1
 	FMOV	a5, t1
 	LDF	[X +  0 * SIZE], a5
 	add	X, INCX, X

 	FADD	c2, t2, c2
 	FMOV	a6, t2
 	LDF	[X +  0 * SIZE], a6
 	add	X, INCX, X

 	FADD	c1, t3, c1
 	FMOV	a7, t3
 	LDF	[X +  0 * SIZE], a7
 	add	X, INCX, X

 	FADD	c2, t4, c2
 	FMOV	a8, t4
 	LDF	[X +  0 * SIZE], a8

 	bg,pt	%icc, .LL51
 	add	X, INCX, X

 /* Drain the last loaded block; a5..a8 remain staged in t1..t4. */
 .LL52:
 	FADD	c1, t1, c1
 	FMOV	a1, t1
 	FADD	c2, t2, c2
 	FMOV	a2, t2

 	FADD	c1, t3, c1
 	FMOV	a3, t3
 	FADD	c2, t4, c2
 	FMOV	a4, t4

 	FADD	c1, t1, c1
 	FMOV	a5, t1
 	FADD	c2, t2, c2
 	FMOV	a6, t2

 	FADD	c1, t3, c1
 	FMOV	a7, t3
 	FADD	c2, t4, c2
 	FMOV	a8, t4

 /* Tail: I = N mod 8 single elements; none -> .LL59. */
 .LL55:
 	and	N, 7, I
 	cmp	I,  0
 	ble,a,pn %icc, .LL59
 	nop

 /* One element per pass; the pointer bump is in the branch delay slot. */
 .LL56:
 	LDF	[X +  0 * SIZE], a1
 	FADD	c1, t1, c1
 	add	I, -1, I
 	FMOV	a1, t1
 	cmp	I, 0
 	bg,pt	%icc, .LL56
 	add	X, INCX, X

 /* Exit for the strided path: flush the staged values, fold c2 into c1 and
    return with the total in c1. */
 .LL59:
 	FADD	c1, t1, c1
 	FADD	c2, t2, c2
 	FADD	c1, t3, c1
 	FADD	c2, t4, c2

 	FADD	c1, c2, c1
 	return	%i7 + 8
 	clr	%o0

 	EPILOGUE
--- a/kernel/sparc/zsum.S
+++ b/kernel/sparc/zsum.S
@@ -0,0 +1,327 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

 #define ASSEMBLER
 #include "common.h"

 /*
  * Complex "sum" kernel for SPARC: adds up every scalar component of the
  * complex vector X without taking absolute values.
  *
  *   N    - number of complex elements
  *   X    - pointer to the first element
  *   INCX - stride; shifted left by ZBASE_SHIFT below (presumably complex
  *          elements to bytes - ZBASE_SHIFT and SIZE come from common.h)
  *   I    - loop counter
  *
  *   c1, c2  - the two running sums
  *   t1..t4  - staging registers: a loaded value waits here for one step
  *             before it is added
  *   a1..a8  - load targets (four complex elements per block)
  */
 #define N	%i0
 #define X	%i1
 #define INCX	%i2
 #define I	%i3

 #ifdef DOUBLE
 #define c1	%f0
 #define c2	%f2
 #define t1	%f8
 #define t2	%f10
 #define t3	%f12
 #define t4	%f14

 #define a1	%f16
 #define a2	%f18
 #define a3	%f20
 #define a4	%f22
 #define a5	%f24
 #define a6	%f26
 #define a7	%f28
 #define a8	%f30
 #else
 #define c1	%f0
 #define c2	%f1
 #define t1	%f4
 #define t2	%f5
 #define t3	%f6
 #define t4	%f7

 #define a1	%f8
 #define a2	%f9
 #define a3	%f10
 #define a4	%f11
 #define a5	%f12
 #define a6	%f13
 #define a7	%f14
 #define a8	%f15
 #endif

 	PROLOGUE
 	SAVESP

 	/* FCLR is a common.h macro; presumably it zeroes FP register 0 (c1). */
 	FCLR(0)

 	sll	INCX, ZBASE_SHIFT, INCX

 	/* Copy c1 into the second sum and the four staging registers. */
 	FMOV	c1, c2
 	FMOV	c1, t1
 	FMOV	c1, t2
 	FMOV	c1, t3
 	FMOV	c1, t4

 	/* Stride <= 0: return via .LL19. */
 	cmp	INCX, 0
 	ble	.LL19
 	nop

 	/* Only a scaled stride of exactly 2 * SIZE takes the contiguous path. */
 	cmp	INCX, 2 * SIZE
 	bne	.LL50
 	nop

 	/* I = N / 4 blocks of four complex elements; none -> tail at .LL15. */
 	sra	N, 2, I
 	cmp	I, 0
 	ble,pn	%icc, .LL15
 	nop

 	/* Contiguous path: preload the first block of eight scalars (four
 	   complex elements); the decrement and test of I are interleaved with
 	   the loads. */
 	LDF	[X +  0 * SIZE], a1
 	add	I, -1, I
 	LDF	[X +  1 * SIZE], a2
 	cmp	I, 0
 	LDF	[X +  2 * SIZE], a3
 	LDF	[X +  3 * SIZE], a4
 	LDF	[X +  4 * SIZE], a5
 	LDF	[X +  5 * SIZE], a6
 	LDF	[X +  6 * SIZE], a7
 	LDF	[X +  7 * SIZE], a8

 	/* Only one block: skip the loop.  The pointer bump is in the delay
 	   slot and therefore runs on both paths. */
 	ble,pt	%icc, .LL12
 	add	X, 8 * SIZE, X

 #define PREFETCHSIZE 32

 /* Steady state: each step adds the value staged in tN, restages tN from aN
    and reloads aN from the next block.  X is advanced before the final load,
    which sits in the branch delay slot and hence uses offset -1 * SIZE. */
 .LL11:
 	FADD	c1, t1, c1
 	prefetch [X  + PREFETCHSIZE * SIZE], 0
 	FMOV	a1, t1
 	LDF	[X +  0 * SIZE], a1

 	FADD	c2, t2, c2
 	add	I, -1, I
 	FMOV	a2, t2
 	LDF	[X +  1 * SIZE], a2

 	FADD	c1, t3, c1
 	cmp	I, 0
 	FMOV	a3, t3
 	LDF	[X +  2 * SIZE], a3

 	FADD	c2, t4, c2
 	nop
 	FMOV	a4, t4
 	LDF	[X +  3 * SIZE], a4

 	FADD	c1, t1, c1
 	nop
 	FMOV	a5, t1
 	LDF	[X +  4 * SIZE], a5

 	FADD	c2, t2, c2
 	nop
 	FMOV	a6, t2
 	LDF	[X +  5 * SIZE], a6

 	FADD	c1, t3, c1
 	FMOV	a7, t3
 	LDF	[X +  6 * SIZE], a7
 	add	X, 8 * SIZE, X

 	FADD	c2, t4, c2
 	FMOV	a8, t4
 	bg,pt	%icc, .LL11
 	LDF	[X -  1 * SIZE], a8

 /* Drain the last loaded block.  On exit a5..a8 are still waiting in t1..t4
    and get added later. */
 .LL12:
 	FADD	c1, t1, c1
 	FMOV	a1, t1
 	FADD	c2, t2, c2
 	FMOV	a2, t2

 	FADD	c1, t3, c1
 	FMOV	a3, t3
 	FADD	c2, t4, c2
 	FMOV	a4, t4

 	FADD	c1, t1, c1
 	FMOV	a5, t1
 	FADD	c2, t2, c2
 	FMOV	a6, t2

 	FADD	c1, t3, c1
 	FMOV	a7, t3
 	FADD	c2, t4, c2
 	FMOV	a8, t4

 /* Tail: I = N mod 4 complex elements; none -> .LL19. */
 .LL15:
 	and	N, 3, I
 	cmp	I,  0
 	ble,a,pn %icc, .LL19
 	nop

 /* One complex element per pass: add the values staged in t1/t2, then stage
    the two new scalars.  The pointer bump is in the branch delay slot. */
 .LL16:
 	LDF	[X +  0 * SIZE], a1
 	LDF	[X +  1 * SIZE], a2
 	add	I, -1, I
 	cmp	I, 0
 	FADD	c1, t1, c1
 	FADD	c2, t2, c2
 	FMOV	a1, t1
 	FMOV	a2, t2
 	bg,pt	%icc, .LL16
 	add	X, 2 * SIZE, X

 /* Exit for the contiguous path and for stride <= 0: flush the four staged
    values, fold c2 into c1 and return with the total in c1. */
 .LL19:
 	FADD	c1, t1, c1
 	FADD	c2, t2, c2
 	FADD	c1, t3, c1
 	FADD	c2, t4, c2

 	FADD	c1, c2, c1
 	return	%i7 + 8
 	clr	%g0

 /* Strided path: each complex element is read as [X] and [X + SIZE], followed
    by X += INCX.  I = N / 4 blocks of four elements; none -> .LL55. */
 .LL50:
 	sra	N, 2, I
 	cmp	I, 0
 	ble,pn	%icc, .LL55
 	nop

 	/* Preload the first four complex elements. */
 	LDF	[X +  0 * SIZE], a1
 	LDF	[X +  1 * SIZE], a2
 	add	X, INCX, X
 	LDF	[X +  0 * SIZE], a3
 	LDF	[X +  1 * SIZE], a4
 	add	X, INCX, X
 	LDF	[X +  0 * SIZE], a5
 	LDF	[X +  1 * SIZE], a6
 	add	X, INCX, X
 	add	I, -1, I
 	LDF	[X +  0 * SIZE], a7
 	cmp	I, 0
 	LDF	[X +  1 * SIZE], a8

 	/* Only one block: skip the loop (the pointer bump in the delay slot
 	   runs on both paths). */
 	ble,pt	%icc, .LL52
 	add	X, INCX, X

 /* Steady state: add the staged value, restage from aN, reload aN. */
 .LL51:
 	FADD	c1, t1, c1
 	add	I, -1, I
 	FMOV	a1, t1
 	LDF	[X +  0 * SIZE], a1

 	FADD	c2, t2, c2
 	cmp	I, 0
 	FMOV	a2, t2
 	LDF	[X +  1 * SIZE], a2
 	add	X, INCX, X

 	FADD	c1, t3, c1
 	FMOV	a3, t3
 	LDF	[X +  0 * SIZE], a3

 	FADD	c2, t4, c2
 	FMOV	a4, t4
 	LDF	[X +  1 * SIZE], a4
 	add	X, INCX, X

 	FADD	c1, t1, c1
 	FMOV	a5, t1
 	LDF	[X +  0 * SIZE], a5

 	FADD	c2, t2, c2
 	FMOV	a6, t2
 	LDF	[X +  1 * SIZE], a6
 	add	X, INCX, X

 	FADD	c1, t3, c1
 	FMOV	a7, t3
 	LDF	[X +  0 * SIZE], a7

 	FADD	c2, t4, c2
 	FMOV	a8, t4
 	LDF	[X +  1 * SIZE], a8

 	bg,pt	%icc, .LL51
 	add	X, INCX, X

 /* Drain the last loaded block; a5..a8 remain staged in t1..t4. */
 .LL52:
 	FADD	c1, t1, c1
 	FMOV	a1, t1
 	FADD	c2, t2, c2
 	FMOV	a2, t2

 	FADD	c1, t3, c1
 	FMOV	a3, t3
 	FADD	c2, t4, c2
 	FMOV	a4, t4

 	FADD	c1, t1, c1
 	FMOV	a5, t1
 	FADD	c2, t2, c2
 	FMOV	a6, t2

 	FADD	c1, t3, c1
 	FMOV	a7, t3
 	FADD	c2, t4, c2
 	FMOV	a8, t4

 /* Tail: I = N mod 4 complex elements; none -> .LL59. */
 .LL55:
 	and	N, 3, I
 	cmp	I,  0
 	ble,a,pn %icc, .LL59
 	nop

 /* One complex element per pass; the pointer bump is in the delay slot. */
 .LL56:
 	LDF	[X +  0 * SIZE], a1
 	LDF	[X +  1 * SIZE], a2
 	FADD	c1, t1, c1
 	FADD	c2, t2, c2
 	add	I, -1, I
 	FMOV	a1, t1
 	FMOV	a2, t2
 	cmp	I, 0
 	bg,pt	%icc, .LL56
 	add	X, INCX, X

 /* Exit for the strided path: flush the staged values, fold c2 into c1 and
    return with the total in c1. */
 .LL59:
 	FADD	c1, t1, c1
 	FADD	c2, t2, c2
 	FADD	c1, t3, c1
 	FADD	c2, t4, c2

 	FADD	c1, c2, c1

 	return	%i7 + 8
 	clr	%o0

 	EPILOGUE
--- a/kernel/x86/KERNEL.generic
+++ b/kernel/x86/KERNEL.generic
@@ -94,6 +94,11 @@ DASUMKERNEL  = ../arm/asum.c
 CASUMKERNEL  = ../arm/zasum.c
 ZASUMKERNEL  = ../arm/zasum.c

 # SUM kernels: the real (S/D) variables share ../arm/sum.c and the complex
 # (C/Z) variables share ../arm/zsum.c.
 SSUMKERNEL  = ../arm/sum.c
 DSUMKERNEL  = ../arm/sum.c
 CSUMKERNEL  = ../arm/zsum.c
 ZSUMKERNEL  = ../arm/zsum.c

 SAXPYKERNEL  = ../arm/axpy.c
 DAXPYKERNEL  = ../arm/axpy.c
 CAXPYKERNEL  = ../arm/zaxpy.c
--- a/kernel/x86/sum.S
+++ b/kernel/x86/sum.S
@@ -0,0 +1,207 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

 #define ASSEMBLER
 #include "common.h"

 /*
  * Real "sum" kernel for 32-bit x86 using the x87 stack: adds up the elements
  * of X without taking absolute values.
  *
  * STACK is the number of bytes this routine pushes (%esi and %ebx), so the
  * arguments are addressed at 4 + STACK and upward from %esp (the extra 4
  * presumably skips the return address).
  */
 #define STACK	 8
 #define ARGS	 0

 #define STACK_M		 4 + STACK + ARGS(%esp)
 #define STACK_X		 8 + STACK + ARGS(%esp)
 #define STACK_INCX	12 + STACK + ARGS(%esp)

 /* M = element count, X = data pointer, INCX = stride, I = loop counter. */
 #define M	%edx
 #define X	%ecx
 #define INCX	%esi

 #define I	%eax

 #include "l1param.h"

 	PROLOGUE

 	/* Saved here and restored at .L999. */
 	pushl	%esi
 	pushl	%ebx

 	PROFCODE

 #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
 	EMMS
 #endif

 	movl	STACK_M,    M
 	movl	STACK_X,    X
 	movl	STACK_INCX, INCX

 	/* With F_INTERFACE, M and INCX hold addresses and are dereferenced. */
 #ifdef F_INTERFACE
 	movl	(M),    M
 	movl	(INCX), INCX
 #endif

 	/* Push 0.0: this is what st(0) holds on the early exits taken for
 	   M <= 0 or INCX <= 0. */
 	fldz
 	testl	M, M
 	jle	.L999
 	testl	INCX, INCX
 	jle	.L999

 	/* Scale the stride by BASE_SHIFT and push three more zeros, giving
 	   four partial sums in st(0)..st(3). */
 	sall	$BASE_SHIFT, INCX
 	fldz
 	fldz
 	fldz
 	/* A scaled stride other than SIZE goes to the strided path at .L40. */
 	cmpl	$SIZE, INCX
 	jne	.L40

 	/* I = M / 8 blocks of eight elements; none -> tail at .L20. */
 	movl	M,  I
 	sarl	$3, I
 	jle	.L20
 	ALIGN_4

 /* Contiguous main loop: eight elements per pass, in two groups of four.
    After four loads the new values sit in st(0)..st(3) above the four
    partial sums in st(4)..st(7).  Each faddp adds st(0) into one partial sum
    and pops, which is why the target index drops by two every time
    (7, 5, 3, 1). */
 .L10:
 #ifdef PREFETCH
 	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
 #endif

 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	FLD	2 * SIZE(X)
 	FLD	3 * SIZE(X)

 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	FLD	4 * SIZE(X)
 	FLD	5 * SIZE(X)
 	FLD	6 * SIZE(X)
 	FLD	7 * SIZE(X)

 	addl	$8 * SIZE, X

 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	decl	I
 	jg	.L10
 	ALIGN_4

 /* Tail: I = M mod 8 single elements; none -> final reduction at .L998. */
 .L20:
 	movl	M,  I
 	andl	$7, I
 	jle	.L998
 	ALIGN_4


 /* One element per pass, added into the partial sum on top of the stack. */
 .L21:
 	FLD	(X)
 	faddp	%st,%st(1)
 	addl	$1 * SIZE, X
 	decl	I
 	jg	.L21
 	jmp	.L998
 	ALIGN_4

 /* Strided path: I = M / 8 blocks of eight elements; none -> tail at .L60. */
 .L40:
 	movl	M,  I
 	sarl	$3, I
 	jle	.L60
 	ALIGN_4

 /* Same accumulation scheme as the contiguous loop, but every load reads (X)
    and is followed by X += INCX. */
 .L50:
 	FLD	(X)
 	addl	INCX, X
 	FLD	(X)
 	addl	INCX, X
 	FLD	(X)
 	addl	INCX, X
 	FLD	(X)
 	addl	INCX, X

 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	FLD	(X)
 	addl	INCX, X
 	FLD	(X)
 	addl	INCX, X
 	FLD	(X)
 	addl	INCX, X
 	FLD	(X)
 	addl	INCX, X

 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	decl	I
 	jg	.L50
 	ALIGN_4

 /* Tail: I = M mod 8 single elements; none -> final reduction at .L998. */
 .L60:
 	movl	M,  I
 	andl	$7, I
 	jle	.L998
 	ALIGN_4


 /* One element per pass; execution then falls through into .L998. */
 .L61:
 	FLD	(X)
 	addl	INCX, X
 	faddp	%st,%st(1)
 	decl	I
 	jg	.L61
 	ALIGN_4

 /* Collapse the four partial sums: each faddp adds st(0) into a lower stack
    slot and pops, so a single value - the total - remains in st(0). */
 .L998:
 	faddp	%st,%st(2)
 	faddp	%st,%st(1)
 	faddp	%st,%st(1)
 	ALIGN_4

 /* Common exit (also reached directly, with only the initial 0.0 on the x87
    stack, when M <= 0 or INCX <= 0): restore the registers pushed on entry
    and return. */
 .L999:
 	popl	%ebx
 	popl	%esi
 	ret

 	EPILOGUE
--- a/kernel/x86/zsum.S
+++ b/kernel/x86/zsum.S
@@ -0,0 +1,208 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

 #define ASSEMBLER
 #include "common.h"

 /* STACK covers the two 4-byte registers pushed on entry, before the arguments are read. */
 #define STACK	 8
 #define ARGS	 0

 #define STACK_M		 4 + STACK + ARGS(%esp)
 #define STACK_X		 8 + STACK + ARGS(%esp)
 #define STACK_INCX	12 + STACK + ARGS(%esp)

 /* M: element count, X: vector pointer, INCX: stride, I: loop counter. */
 #define M	%edx
 #define X	%ecx
 #define INCX	%esi

 #define I	%eax

 #include "l1param.h"

 /*
  * x87 kernel for complex vectors: adds up both components (the two
  * consecutive scalars) of M elements of X read INCX elements apart, without
  * taking absolute values.  Four partial sums live on the x87 register stack
  * and are combined into a single scalar at .L998, left in %st(0).  When
  * M <= 0 or INCX <= 0 the routine returns the single 0.0 pushed before the
  * checks.
  */
 	PROLOGUE

 	pushl	%esi
 	pushl	%ebx

 	PROFCODE

 #if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
 	EMMS
 #endif

 	movl	STACK_M,    M
 	movl	STACK_X,    X
 	movl	STACK_INCX, INCX

 	/* With F_INTERFACE the count and stride arrive as pointers; load the values. */
 #ifdef F_INTERFACE
 	movl	(M),    M
 	movl	(INCX), INCX
 #endif

 	/* First accumulator; it is also the value returned on the early exits. */
 	fldz
 	testl	M,  M
 	jle	.L999
 	testl	INCX, INCX
 	jle	.L999

 	/* INCX is added directly to the X pointer below, so scale it first. */
 	sall	$ZBASE_SHIFT, INCX

 	/* Three more accumulators, four in total at %st(0)..%st(3). */
 	fldz
 	fldz
 	fldz
 	/* A stride of exactly one complex element (2 * SIZE) takes the contiguous path. */
 	cmpl	$SIZE * 2, INCX
 	jne	.L40

 	/* I = M / 4: groups of four complex elements, i.e. eight scalars. */
 	movl	M,  I
 	sarl	$2, I
 	jle	.L20
 	ALIGN_4

 .L10:
 #ifdef PREFETCH
 	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
 #endif

 	/* Push four scalars; the stack is now eight deep (4 values, 4 sums). */
 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	FLD	2 * SIZE(X)
 	FLD	3 * SIZE(X)

 	/*
 	 * Each faddp pops one value, so the shrinking offsets 7/5/3/1 land
 	 * every value in a different accumulator.
 	 */
 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	FLD	4 * SIZE(X)
 	FLD	5 * SIZE(X)
 	FLD	6 * SIZE(X)
 	FLD	7 * SIZE(X)

 	addl	$8 * SIZE, X

 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	decl	I
 	jg	.L10
 	ALIGN_4

 .L20:
 	/* I = M % 4: leftover complex elements of the contiguous path. */
 	movl	M,  I
 	andl	$3, I
 	jle	.L998
 	ALIGN_4


 .L21:
 	/* One complex element: its two scalars go to two different accumulators. */
 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	faddp	%st,%st(3)
 	faddp	%st,%st(1)
 	addl	$2 * SIZE, X
 	decl	I
 	jg	.L21
 	jmp	.L998
 	ALIGN_4

 .L40:
 	/* Strided path: same grouping by four, X advanced by INCX per element. */
 	movl	M,  I
 	sarl	$2, I
 	jle	.L60
 	ALIGN_4

 .L50:
 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	addl	INCX, X
 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	addl	INCX, X

 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	addl	INCX, X
 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	addl	INCX, X

 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	decl	I
 	jg	.L50
 	ALIGN_4

 .L60:
 	/* Leftover complex elements of the strided path. */
 	movl	M,  I
 	andl	$3, I
 	jle	.L998
 	ALIGN_4


 .L61:
 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	addl	INCX, X
 	faddp	%st,%st(3)
 	faddp	%st,%st(1)
 	decl	I
 	jg	.L61
 	ALIGN_4

 .L998:
 	/* Fold the four accumulators into one value in %st(0). */
 	faddp	%st,%st(2)
 	faddp	%st,%st(1)
 	faddp	%st,%st(1)
 	ALIGN_4

 .L999:
 	/* Restore the registers saved on entry, in reverse order. */
 	popl	%ebx
 	popl	%esi
 	ret

 	EPILOGUE
--- a/kernel/x86_64/KERNEL.generic
+++ b/kernel/x86_64/KERNEL.generic
@@ -94,6 +94,11 @@ DASUMKERNEL  = ../arm/asum.c
 CASUMKERNEL  = ../arm/zasum.c
 ZASUMKERNEL  = ../arm/zasum.c

 # ?SUM kernels (sum without absolute value): real types use ../arm/sum.c,
 # complex types use ../arm/zsum.c.
 SSUMKERNEL  = ../arm/sum.c
 DSUMKERNEL  = ../arm/sum.c
 CSUMKERNEL  = ../arm/zsum.c
 ZSUMKERNEL  = ../arm/zsum.c

 SAXPYKERNEL  = ../arm/axpy.c
 DAXPYKERNEL  = ../arm/axpy.c
 CAXPYKERNEL  = ../arm/zaxpy.c
--- a/kernel/x86_64/sum.S
+++ b/kernel/x86_64/sum.S
@@ -0,0 +1,179 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

 #define ASSEMBLER
 #include "common.h"

 /* M: element count, X: vector pointer, INCX: stride, I: loop counter. */
 #define M	ARG1
 #define X	ARG2
 #define INCX	ARG3

 #define I	%rax

 #include "l1param.h"

 /*
  * x87 kernel: plain sum (no absolute value is taken) of M elements of X read
  * INCX elements apart.  Four partial sums are kept on the x87 register stack
  * and combined at .L998; the result is left in %st(0).  When M <= 0 or
  * INCX <= 0 the routine returns the single 0.0 pushed before the checks.
  * M is overwritten with the remainder count once the main loop is done.
  */
 	PROLOGUE
 	PROFCODE

 	/* First accumulator; it is also the value returned on the early exits. */
 	fldz
 	testq	M, M
 	jle	.L999
 	testq	INCX, INCX
 	jle	.L999

 	/* INCX is added directly to the X pointer below, so scale it first. */
 	salq	$BASE_SHIFT, INCX

 	/* Three more accumulators, four in total at %st(0)..%st(3). */
 	fldz
 	fldz
 	fldz
 	/* A stride of exactly one element takes the contiguous path, else .L40. */
 	cmpq	$SIZE, INCX
 	jne	.L40

 	/* I = M / 8: number of eight-element groups. */
 	movq	M, I
 	sarq	$3,   I
 	jle	.L20
 	ALIGN_4

 .L10:
 #ifdef PREFETCH
 	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
 #endif

 	/* Push four elements; the stack is now eight deep (4 values, 4 sums). */
 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	FLD	2 * SIZE(X)
 	FLD	3 * SIZE(X)

 	/*
 	 * Each faddp pops one value, so the shrinking offsets 7/5/3/1 land
 	 * every value in a different accumulator.
 	 */
 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	FLD	4 * SIZE(X)
 	FLD	5 * SIZE(X)
 	FLD	6 * SIZE(X)
 	FLD	7 * SIZE(X)

 	addq	$8 * SIZE, X

 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	decq	I
 	jg	.L10
 	ALIGN_4

 .L20:
 	/* M = M % 8: leftover elements of the contiguous path. */
 	andq	$7,  M
 	jle	.L998
 	ALIGN_4

 .L21:
 	/* Add one leftover element to the accumulator on top of the stack. */
 	FLD	(X)
 	faddp	%st,%st(1)
 	addq	$1 * SIZE, X
 	decq	M
 	jg	.L21
 	jmp	.L998
 	ALIGN_4

 .L40:
 	/* Strided path: same grouping by eight, X advanced by INCX per element. */
 	movq	M, I
 	sarq	$3,   I
 	jle	.L60
 	ALIGN_4

 .L50:
 	FLD	(X)
 	addq	INCX, X
 	FLD	(X)
 	addq	INCX, X
 	FLD	(X)
 	addq	INCX, X
 	FLD	(X)
 	addq	INCX, X

 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	FLD	(X)
 	addq	INCX, X
 	FLD	(X)
 	addq	INCX, X
 	FLD	(X)
 	addq	INCX, X
 	FLD	(X)
 	addq	INCX, X

 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	decq	I
 	jg	.L50
 	ALIGN_4

 .L60:
 	/* Leftover elements of the strided path. */
 	andq	$7,  M
 	jle	.L998
 	ALIGN_4


 .L61:
 	FLD	(X)
 	addq	INCX, X
 	faddp	%st,%st(1)
 	decq	M
 	jg	.L61
 	ALIGN_4

 .L998:
 	/* Fold the four accumulators into one value in %st(0). */
 	faddp	%st,%st(2)
 	faddp	%st,%st(1)
 	faddp	%st,%st(1)
 	ALIGN_4

 .L999:
 	ret

 	EPILOGUE
--- a/kernel/x86_64/zsum.S
+++ b/kernel/x86_64/zsum.S
@@ -0,0 +1,180 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

 #define ASSEMBLER
 #include "common.h"

 /* M: element count, X: vector pointer, INCX: stride, I: loop counter. */
 #define M	ARG1
 #define X	ARG2
 #define INCX	ARG3

 #define I	%rax

 #include "l1param.h"

 /*
  * x87 kernel for complex vectors: adds up both components (the two
  * consecutive scalars) of M elements of X read INCX elements apart, without
  * taking absolute values.  Four partial sums live on the x87 register stack
  * and are combined into a single scalar at .L998, left in %st(0).  When
  * M <= 0 or INCX <= 0 the routine returns the single 0.0 pushed before the
  * checks.  M is overwritten with the remainder count once the main loop is
  * done.
  */
 	PROLOGUE
 	PROFCODE

 	/* First accumulator; it is also the value returned on the early exits. */
 	fldz
 	testq	M, M
 	jle	.L999
 	testq	INCX, INCX
 	jle	.L999

 	/* INCX is added directly to the X pointer below, so scale it first. */
 	salq	$ZBASE_SHIFT, INCX

 	/* Three more accumulators, four in total at %st(0)..%st(3). */
 	fldz
 	fldz
 	fldz
 	/* A stride of exactly one complex element (2 * SIZE) takes the contiguous path. */
 	cmpq	$SIZE * 2, INCX
 	jne	.L40

 	/* I = M / 4: groups of four complex elements, i.e. eight scalars. */
 	movq	M, I
 	sarq	$2,   I
 	jle	.L20
 	ALIGN_4

 .L10:
 #ifdef PREFETCH
 	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
 #endif

 	/* Push four scalars; the stack is now eight deep (4 values, 4 sums). */
 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	FLD	2 * SIZE(X)
 	FLD	3 * SIZE(X)

 	/*
 	 * Each faddp pops one value, so the shrinking offsets 7/5/3/1 land
 	 * every value in a different accumulator.
 	 */
 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	FLD	4 * SIZE(X)
 	FLD	5 * SIZE(X)
 	FLD	6 * SIZE(X)
 	FLD	7 * SIZE(X)

 	addq	$8 * SIZE, X

 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	decq	I
 	jg	.L10
 	ALIGN_4

 .L20:
 	/* M = M % 4: leftover complex elements of the contiguous path. */
 	andq	$3,  M
 	jle	.L998
 	ALIGN_4


 .L21:
 	/* One complex element: its two scalars go to two different accumulators. */
 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	faddp	%st,%st(3)
 	faddp	%st,%st(1)
 	addq	$2 * SIZE, X
 	decq	M
 	jg	.L21
 	jmp	.L998
 	ALIGN_4

 .L40:
 	/* Strided path: same grouping by four, X advanced by INCX per element. */
 	movq	M, I
 	sarq	$2,   I
 	jle	.L60
 	ALIGN_4

 .L50:
 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	addq	INCX, X
 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	addq	INCX, X

 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	addq	INCX, X
 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	addq	INCX, X

 	faddp	%st, %st(7)
 	faddp	%st, %st(5)
 	faddp	%st, %st(3)
 	faddp	%st, %st(1)

 	decq	I
 	jg	.L50
 	ALIGN_4

 .L60:
 	/* Leftover complex elements of the strided path. */
 	andq	$3,  M
 	jle	.L998
 	ALIGN_4


 .L61:
 	FLD	0 * SIZE(X)
 	FLD	1 * SIZE(X)
 	addq	INCX, X
 	faddp	%st,%st(3)
 	faddp	%st,%st(1)
 	decq	M
 	jg	.L61
 	ALIGN_4

 .L998:
 	/* Fold the four accumulators into one value in %st(0). */
 	faddp	%st,%st(2)
 	faddp	%st,%st(1)
 	faddp	%st,%st(1)
 	ALIGN_4

 .L999:
 	ret

 	EPILOGUE
--- a/kernel/zarch/KERNEL.Z13
+++ b/kernel/zarch/KERNEL.Z13
@@ -35,6 +35,11 @@ DASUMKERNEL  = dasum.c
 CASUMKERNEL  = ../arm/zasum.c
 ZASUMKERNEL  = zasum.c

 # ?SUM kernels add the elements without taking absolute values, so they must
 # not reuse the ?ASUM sources.  The choice of local versus ../arm sources
 # mirrors the ASUM selections for this target: ../arm for the single
 # precision types, the local files for the double precision types.
 SSUMKERNEL  = ../arm/sum.c
 DSUMKERNEL  = dsum.c
 CSUMKERNEL  = ../arm/zsum.c
 ZSUMKERNEL  = zsum.c

 SAXPYKERNEL  = ../arm/axpy.c
 DAXPYKERNEL  = daxpy.c
 CAXPYKERNEL  = ../arm/zaxpy.c
--- a/kernel/zarch/KERNEL.Z14
+++ b/kernel/zarch/KERNEL.Z14
@@ -35,6 +35,11 @@ DASUMKERNEL  = dasum.c
 CASUMKERNEL  = casum.c
 ZASUMKERNEL  = zasum.c

 # ?SUM kernels (sum without absolute value): one source file per precision.
 SSUMKERNEL  = ssum.c
 DSUMKERNEL  = dsum.c
 CSUMKERNEL  = csum.c
 ZSUMKERNEL  = zsum.c

 SAXPYKERNEL  = saxpy.c
 DAXPYKERNEL  = daxpy.c
 CAXPYKERNEL  = caxpy.c
--- a/kernel/zarch/KERNEL.ZARCH_GENERIC
+++ b/kernel/zarch/KERNEL.ZARCH_GENERIC
@@ -35,6 +35,11 @@ DASUMKERNEL  = ../arm/asum.c
 CASUMKERNEL  = ../arm/zasum.c
 ZASUMKERNEL  = ../arm/zasum.c

 # ?SUM kernels (sum without absolute value): real types use ../arm/sum.c,
 # complex types use ../arm/zsum.c.
 SSUMKERNEL  = ../arm/sum.c
 DSUMKERNEL  = ../arm/sum.c
 CSUMKERNEL  = ../arm/zsum.c
 ZSUMKERNEL  = ../arm/zsum.c

 SAXPYKERNEL  = ../arm/axpy.c
 DAXPYKERNEL  = ../arm/axpy.c
 CAXPYKERNEL  = ../arm/zaxpy.c
--- a/kernel/zarch/csum.c
+++ b/kernel/zarch/csum.c
@@ -0,0 +1,137 @@
 /***************************************************************************
 Copyright (c) 2013-2019, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

 #include "common.h"
 #include <math.h>

 /*
  * Vector kernel for the contiguous csum case: adds up every scalar (both
  * components) of n complex elements starting at x and returns the total.
  *
  * n must be a positive multiple of 32.  It is shifted right by 5 to obtain
  * the trip count, and the loop is closed by brctg, which decrements before
  * testing, so a count of zero would not stop after one pass.  Each pass
  * loads sixteen 16-byte vectors (256 bytes) and adds them lane-wise with
  * vfasb into the eight accumulators v24..v31.  After the loop the
  * accumulators are added into v24, its lanes are folded into element 0
  * (veslg/vrepf + vfasb) and that element is stored to the [sum] output.
  */
 static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__("vzero   %%v24\n\t"
    "vzero   %%v25\n\t"
    "vzero   %%v26\n\t"
    "vzero   %%v27\n\t"
    "vzero   %%v28\n\t"
    "vzero   %%v29\n\t"
    "vzero   %%v30\n\t"
    "vzero   %%v31\n\t"
    "srlg  %[n],%[n],5\n\t"
    "xgr %%r1,%%r1\n\t"
    "0:\n\t"
    "pfd  1, 1024(%%r1,%[x])\n\t"
    "vl  %%v16, 0(%%r1,%[x])\n\t"
    "vl  %%v17, 16(%%r1,%[x])\n\t"
    "vl  %%v18, 32(%%r1,%[x])\n\t"
    "vl  %%v19, 48(%%r1,%[x])\n\t"
    "vl  %%v20, 64(%%r1,%[x])\n\t"
    "vl  %%v21, 80(%%r1,%[x])\n\t"
    "vl  %%v22, 96(%%r1,%[x])\n\t"
    "vl  %%v23, 112(%%r1,%[x])\n\t"
    "vfasb   %%v24,%%v24,%%v16\n\t"
    "vfasb   %%v25,%%v25,%%v17\n\t"
    "vfasb   %%v26,%%v26,%%v18\n\t"
    "vfasb   %%v27,%%v27,%%v19\n\t"
    "vfasb   %%v28,%%v28,%%v20\n\t"
    "vfasb   %%v29,%%v29,%%v21\n\t"
    "vfasb   %%v30,%%v30,%%v22\n\t"
    "vfasb   %%v31,%%v31,%%v23\n\t"
    "vl  %%v16, 128(%%r1,%[x])\n\t"
    "vl  %%v17, 144(%%r1,%[x])\n\t"
    "vl  %%v18, 160(%%r1,%[x])\n\t"
    "vl  %%v19, 176(%%r1,%[x])\n\t"
    "vl  %%v20, 192(%%r1,%[x])\n\t"
    "vl  %%v21, 208(%%r1,%[x])\n\t"
    "vl  %%v22, 224(%%r1,%[x])\n\t"
    "vl  %%v23, 240(%%r1,%[x])\n\t"
    "vfasb   %%v24,%%v24,%%v16\n\t"
    "vfasb   %%v25,%%v25,%%v17\n\t"
    "vfasb   %%v26,%%v26,%%v18\n\t"
    "vfasb   %%v27,%%v27,%%v19\n\t"
    "vfasb   %%v28,%%v28,%%v20\n\t"
    "vfasb   %%v29,%%v29,%%v21\n\t"
    "vfasb   %%v30,%%v30,%%v22\n\t"
    "vfasb   %%v31,%%v31,%%v23\n\t"
    "agfi  %%r1,256\n\t"
    "brctg %[n],0b\n\t"
    "vfasb   %%v24,%%v24,%%v25\n\t"
    "vfasb   %%v24,%%v24,%%v26\n\t"
    "vfasb   %%v24,%%v24,%%v27\n\t"
    "vfasb   %%v24,%%v24,%%v28\n\t"
    "vfasb   %%v24,%%v24,%%v29\n\t"
    "vfasb   %%v24,%%v24,%%v30\n\t"
    "vfasb   %%v24,%%v24,%%v31\n\t"
    "veslg   %%v25,%%v24,32\n\t"
    "vfasb   %%v24,%%v24,%%v25\n\t"
    "vrepf   %%v25,%%v24,2\n\t"
    "vfasb   %%v24,%%v24,%%v25\n\t"
    /* The store has to use the name of the output operand declared below. */
    "vstef   %%v24,%[sum],0"
    : [sum] "=Q"(sum),[n] "+&r"(n)
    : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
    : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
       "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
 }

 /*
  * csum entry point: returns the sum of both components of n complex
  * elements of x read inc_x elements apart.  Returns 0 when n <= 0 or
  * inc_x <= 0.  For unit stride the largest multiple of 32 elements is
  * handed to csum_kernel_32 and the rest is added one element at a time.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG done = 0;
  BLASLONG pos = 0;
  BLASLONG step;

  if (n <= 0 || inc_x <= 0)
    return total;

  /* Two FLOATs per element, so the scalar index moves by twice the stride. */
  step = 2 * inc_x;

  if (inc_x == 1) {
    BLASLONG bulk = n & -32;

    if (bulk > 0) {
      total = csum_kernel_32(bulk, x);
      done = bulk;
      pos = 2 * bulk;
    }
  }

  /* Scalar loop: everything for strided input, the tail for unit stride. */
  for (; done < n; done++) {
    total += x[pos] + x[pos + 1];
    pos += step;
  }

  return total;
 }
--- a/kernel/zarch/dsum.c
+++ b/kernel/zarch/dsum.c
@@ -0,0 +1,148 @@
 /***************************************************************************
 Copyright (c) 2013-2019, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

 #include "common.h"
 #include <math.h>

 /*
  * Vector kernel for the contiguous dsum case: adds up n elements starting
  * at x and returns the total.
  *
  * n must be a positive multiple of 32.  It is shifted right by 5 to obtain
  * the trip count, and the loop is closed by brctg, which decrements before
  * testing, so a count of zero would not stop after one pass.  Each pass
  * loads sixteen 16-byte vectors (256 bytes) and adds them lane-wise with
  * vfadb into the eight accumulators v24..v31.  After the loop the
  * accumulators are added into v24, its two lanes are folded into element 0
  * (vrepg + vfadb) and that element is stored to the [sum] output.
  */
 static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__("vzero   %%v24\n\t"
    "vzero   %%v25\n\t"
    "vzero   %%v26\n\t"
    "vzero   %%v27\n\t"
    "vzero   %%v28\n\t"
    "vzero   %%v29\n\t"
    "vzero   %%v30\n\t"
    "vzero   %%v31\n\t"
    "srlg  %[n],%[n],5\n\t"
    "xgr %%r1,%%r1\n\t"
    "0:\n\t"
    "pfd  1, 1024(%%r1,%[x])\n\t"
    "vl  %%v16, 0(%%r1,%[x])\n\t"
    "vl  %%v17, 16(%%r1,%[x])\n\t"
    "vl  %%v18, 32(%%r1,%[x])\n\t"
    "vl  %%v19, 48(%%r1,%[x])\n\t"
    "vl  %%v20, 64(%%r1,%[x])\n\t"
    "vl  %%v21, 80(%%r1,%[x])\n\t"
    "vl  %%v22, 96(%%r1,%[x])\n\t"
    "vl  %%v23, 112(%%r1,%[x])\n\t"
    "vfadb   %%v24,%%v24,%%v16\n\t"
    "vfadb   %%v25,%%v25,%%v17\n\t"
    "vfadb   %%v26,%%v26,%%v18\n\t"
    "vfadb   %%v27,%%v27,%%v19\n\t"
    "vfadb   %%v28,%%v28,%%v20\n\t"
    "vfadb   %%v29,%%v29,%%v21\n\t"
    "vfadb   %%v30,%%v30,%%v22\n\t"
    "vfadb   %%v31,%%v31,%%v23\n\t"
    "vl  %%v16, 128(%%r1,%[x])\n\t"
    "vl  %%v17, 144(%%r1,%[x])\n\t"
    "vl  %%v18, 160(%%r1,%[x])\n\t"
    "vl  %%v19, 176(%%r1,%[x])\n\t"
    "vl  %%v20, 192(%%r1,%[x])\n\t"
    "vl  %%v21, 208(%%r1,%[x])\n\t"
    "vl  %%v22, 224(%%r1,%[x])\n\t"
    "vl  %%v23, 240(%%r1,%[x])\n\t"
    "vfadb   %%v24,%%v24,%%v16\n\t"
    "vfadb   %%v25,%%v25,%%v17\n\t"
    "vfadb   %%v26,%%v26,%%v18\n\t"
    "vfadb   %%v27,%%v27,%%v19\n\t"
    "vfadb   %%v28,%%v28,%%v20\n\t"
    "vfadb   %%v29,%%v29,%%v21\n\t"
    "vfadb   %%v30,%%v30,%%v22\n\t"
    "vfadb   %%v31,%%v31,%%v23\n\t"
    "agfi  %%r1,256\n\t"
    "brctg %[n],0b\n\t"
    "vfadb   %%v24,%%v24,%%v25\n\t"
    "vfadb   %%v24,%%v24,%%v26\n\t"
    "vfadb   %%v24,%%v24,%%v27\n\t"
    "vfadb   %%v24,%%v24,%%v28\n\t"
    "vfadb   %%v24,%%v24,%%v29\n\t"
    "vfadb   %%v24,%%v24,%%v30\n\t"
    "vfadb   %%v24,%%v24,%%v31\n\t"
    "vrepg   %%v25,%%v24,1\n\t"
    "vfadb   %%v24,%%v24,%%v25\n\t"
    /* The store has to use the name of the output operand declared below. */
    "vsteg   %%v24,%[sum],0"
    : [sum] "=Q"(sum),[n] "+&r"(n)
    : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
    : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
       "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
 }

 /*
  * dsum entry point: returns the sum of n elements of x read inc_x elements
  * apart.  Returns 0 when n <= 0 or inc_x <= 0.  For unit stride the largest
  * multiple of 32 elements is handed to dsum_kernel_32 and the rest is added
  * one by one; for other strides the elements are added four at a time into
  * two alternating partial sums, followed by a scalar tail.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG pos = 0;
  BLASLONG seen = 0;

  if (n <= 0 || inc_x <= 0)
    return total;

  if (inc_x == 1) {
    const BLASLONG bulk = n & -32;

    if (bulk > 0) {
      total = dsum_kernel_32(bulk, x);
      pos = bulk;
    }

    /* Tail that did not fill a whole block of 32. */
    for (; pos < n; pos++)
      total += x[pos];

    return total;
  }

  {
    const BLASLONG quads = n & -4;
    FLOAT lane_a = 0.0;
    FLOAT lane_b = 0.0;

    /* The order of the additions is kept: a, b, a, b per group of four. */
    for (; seen < quads; seen += 4) {
      lane_a += x[pos];
      lane_b += x[pos + inc_x];
      lane_a += x[pos + 2 * inc_x];
      lane_b += x[pos + 3 * inc_x];
      pos += inc_x * 4;
    }

    total = lane_a + lane_b;

    for (; seen < n; seen++) {
      total += x[pos];
      pos += inc_x;
    }
  }

  return total;
 }
--- a/kernel/zarch/ssum.c
+++ b/kernel/zarch/ssum.c
@@ -0,0 +1,151 @@
 /***************************************************************************
 Copyright (c) 2013-2019, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

 #include "common.h"
 #include <math.h>


 /*
  * Vector kernel for the contiguous ssum case: adds up n elements starting
  * at x and returns the total.
  *
  * n must be a positive multiple of 64.  It is shifted right by 6 to obtain
  * the trip count, and the loop is closed by brctg, which decrements before
  * testing, so a count of zero would not stop after one pass.  Each pass
  * loads sixteen 16-byte vectors (256 bytes) and adds them lane-wise with
  * vfasb into the eight accumulators v24..v31.  After the loop the
  * accumulators are added into v24, its lanes are folded into element 0
  * (veslg/vrepf + vfasb) and that element is stored to the [sum] output.
  */
 static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__("vzero   %%v24\n\t"
    "vzero   %%v25\n\t"
    "vzero   %%v26\n\t"
    "vzero   %%v27\n\t"
    "vzero   %%v28\n\t"
    "vzero   %%v29\n\t"
    "vzero   %%v30\n\t"
    "vzero   %%v31\n\t"
    "srlg  %[n],%[n],6\n\t"
    "xgr %%r1,%%r1\n\t"
    "0:\n\t"
    "pfd  1, 1024(%%r1,%[x])\n\t"
    "vl  %%v16, 0(%%r1,%[x])\n\t"
    "vl  %%v17, 16(%%r1,%[x])\n\t"
    "vl  %%v18, 32(%%r1,%[x])\n\t"
    "vl  %%v19, 48(%%r1,%[x])\n\t"
    "vl  %%v20, 64(%%r1,%[x])\n\t"
    "vl  %%v21, 80(%%r1,%[x])\n\t"
    "vl  %%v22, 96(%%r1,%[x])\n\t"
    "vl  %%v23, 112(%%r1,%[x])\n\t"
    "vfasb   %%v24,%%v24,%%v16\n\t"
    "vfasb   %%v25,%%v25,%%v17\n\t"
    "vfasb   %%v26,%%v26,%%v18\n\t"
    "vfasb   %%v27,%%v27,%%v19\n\t"
    "vfasb   %%v28,%%v28,%%v20\n\t"
    "vfasb   %%v29,%%v29,%%v21\n\t"
    "vfasb   %%v30,%%v30,%%v22\n\t"
    "vfasb   %%v31,%%v31,%%v23\n\t"
    "vl  %%v16, 128(%%r1,%[x])\n\t"
    "vl  %%v17, 144(%%r1,%[x])\n\t"
    "vl  %%v18, 160(%%r1,%[x])\n\t"
    "vl  %%v19, 176(%%r1,%[x])\n\t"
    "vl  %%v20, 192(%%r1,%[x])\n\t"
    "vl  %%v21, 208(%%r1,%[x])\n\t"
    "vl  %%v22, 224(%%r1,%[x])\n\t"
    "vl  %%v23, 240(%%r1,%[x])\n\t"
    "vfasb   %%v24,%%v24,%%v16\n\t"
    "vfasb   %%v25,%%v25,%%v17\n\t"
    "vfasb   %%v26,%%v26,%%v18\n\t"
    "vfasb   %%v27,%%v27,%%v19\n\t"
    "vfasb   %%v28,%%v28,%%v20\n\t"
    "vfasb   %%v29,%%v29,%%v21\n\t"
    "vfasb   %%v30,%%v30,%%v22\n\t"
    "vfasb   %%v31,%%v31,%%v23\n\t"
    "agfi  %%r1,256\n\t"
    "brctg %[n],0b\n\t"
    "vfasb   %%v24,%%v24,%%v25\n\t"
    "vfasb   %%v24,%%v24,%%v26\n\t"
    "vfasb   %%v24,%%v24,%%v27\n\t"
    "vfasb   %%v24,%%v24,%%v28\n\t"
    "vfasb   %%v24,%%v24,%%v29\n\t"
    "vfasb   %%v24,%%v24,%%v30\n\t"
    "vfasb   %%v24,%%v24,%%v31\n\t"
    "veslg   %%v25,%%v24,32\n\t"
    "vfasb   %%v24,%%v24,%%v25\n\t"
    "vrepf   %%v25,%%v24,2\n\t"
    "vfasb   %%v24,%%v24,%%v25\n\t"
    /* The store has to use the name of the output operand declared below. */
    "vstef   %%v24,%[sum],0"
    : [sum] "=Q"(sum),[n] "+&r"(n)
    : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
    : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
       "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
 }

 /*
  * ssum entry point: returns the sum of n elements of x read inc_x elements
  * apart.  Returns 0 when n <= 0 or inc_x <= 0.  For unit stride the largest
  * multiple of 64 elements is handed to ssum_kernel_64 and the rest is added
  * one by one; for other strides the elements are added four at a time into
  * two alternating partial sums, followed by a scalar tail.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT result = 0.0;
  BLASLONG idx = 0;
  BLASLONG count = 0;

  if (n <= 0 || inc_x <= 0)
    return result;

  if (inc_x != 1) {
    const BLASLONG unrolled = n & -4;
    FLOAT part0 = 0.0;
    FLOAT part1 = 0.0;

    /* The order of the additions is kept: 0, 1, 0, 1 per group of four. */
    for (; count < unrolled; count += 4) {
      part0 += x[idx];
      part1 += x[idx + inc_x];
      part0 += x[idx + 2 * inc_x];
      part1 += x[idx + 3 * inc_x];
      idx += inc_x * 4;
    }

    result = part0 + part1;

    for (; count < n; count++) {
      result += x[idx];
      idx += inc_x;
    }

    return result;
  }

  {
    const BLASLONG blocked = n & -64;

    if (blocked > 0) {
      result = ssum_kernel_64(blocked, x);
      idx = blocked;
    }
  }

  /* Tail that did not fill a whole block of 64. */
  for (; idx < n; idx++)
    result += x[idx];

  return result;
 }
--- a/kernel/zarch/zsum.c
+++ b/kernel/zarch/zsum.c
@@ -0,0 +1,136 @@
 /***************************************************************************
 Copyright (c) 2013-2019, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
 met:
 1. Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in
 the documentation and/or other materials provided with the
 distribution.
 3. Neither the name of the OpenBLAS project nor the names of
 its contributors may be used to endorse or promote products
 derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

 #include "common.h"
 #include <math.h>


 /*
  * Vectorized plain sum over n complex elements, i.e. 2 * n consecutive
  * FLOATs starting at x. Real and imaginary parts are all folded into one
  * scalar, which is returned.
  *
  * The loop runs n >> 4 times and each pass loads 256 bytes (sixteen 16-byte
  * vector loads). brctg decrements the counter before testing it, so n must
  * be a positive multiple of 16; the driver in this file passes n & -16 and
  * only calls in when that value is > 0.
  *
  * NOTE(review): vfadb/vrepg/vsteg work on 64-bit lanes, so FLOAT is
  * presumably double in this translation unit - confirm against the build.
  */
 static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__(
    /* v24..v31: eight independent partial-sum accumulators, cleared. */
    "vzero   %%v24\n\t"
    "vzero   %%v25\n\t"
    "vzero   %%v26\n\t"
    "vzero   %%v27\n\t"
    "vzero   %%v28\n\t"
    "vzero   %%v29\n\t"
    "vzero   %%v30\n\t"
    "vzero   %%v31\n\t"
    /* n becomes the trip count (n / 16); r1 is the running byte offset. */
    "srlg  %[n],%[n],4\n\t"
    "xgr %%r1,%%r1\n\t"
    "0:\n\t"
    /* Prefetch hint for data 1024 bytes ahead of the current offset. */
    "pfd  1, 1024(%%r1,%[x])\n\t"
    /* First 128 bytes of the pass. */
    "vl  %%v16, 0(%%r1,%[x])\n\t"
    "vl  %%v17, 16(%%r1,%[x])\n\t"
    "vl  %%v18, 32(%%r1,%[x])\n\t"
    "vl  %%v19, 48(%%r1,%[x])\n\t"
    "vl  %%v20, 64(%%r1,%[x])\n\t"
    "vl  %%v21, 80(%%r1,%[x])\n\t"
    "vl  %%v22, 96(%%r1,%[x])\n\t"
    "vl  %%v23, 112(%%r1,%[x])\n\t"
    "vfadb   %%v24,%%v24,%%v16\n\t"
    "vfadb   %%v25,%%v25,%%v17\n\t"
    "vfadb   %%v26,%%v26,%%v18\n\t"
    "vfadb   %%v27,%%v27,%%v19\n\t"
    "vfadb   %%v28,%%v28,%%v20\n\t"
    "vfadb   %%v29,%%v29,%%v21\n\t"
    "vfadb   %%v30,%%v30,%%v22\n\t"
    "vfadb   %%v31,%%v31,%%v23\n\t"
    /* Second 128 bytes of the pass. */
    "vl  %%v16, 128(%%r1,%[x])\n\t"
    "vl  %%v17, 144(%%r1,%[x])\n\t"
    "vl  %%v18, 160(%%r1,%[x])\n\t"
    "vl  %%v19, 176(%%r1,%[x])\n\t"
    "vl  %%v20, 192(%%r1,%[x])\n\t"
    "vl  %%v21, 208(%%r1,%[x])\n\t"
    "vl  %%v22, 224(%%r1,%[x])\n\t"
    "vl  %%v23, 240(%%r1,%[x])\n\t"
    "vfadb   %%v24,%%v24,%%v16\n\t"
    "vfadb   %%v25,%%v25,%%v17\n\t"
    "vfadb   %%v26,%%v26,%%v18\n\t"
    "vfadb   %%v27,%%v27,%%v19\n\t"
    "vfadb   %%v28,%%v28,%%v20\n\t"
    "vfadb   %%v29,%%v29,%%v21\n\t"
    "vfadb   %%v30,%%v30,%%v22\n\t"
    "vfadb   %%v31,%%v31,%%v23\n\t"
    /* Advance 256 bytes; decrement the trip count and loop while non-zero. */
    "agfi  %%r1,256\n\t"
    "brctg %[n],0b\n\t"
    /* Fold the eight accumulators into v24. */
    "vfadb   %%v24,%%v24,%%v25\n\t"
    "vfadb   %%v24,%%v24,%%v26\n\t"
    "vfadb   %%v24,%%v24,%%v27\n\t"
    "vfadb   %%v24,%%v24,%%v28\n\t"
    "vfadb   %%v24,%%v24,%%v29\n\t"
    "vfadb   %%v24,%%v24,%%v30\n\t"
    "vfadb   %%v24,%%v24,%%v31\n\t"
    /* Horizontal add: replicate lane 1, add it in, store lane 0. */
    "vrepg   %%v25,%%v24,1\n\t"
    "vfadb   %%v24,%%v24,%%v25\n\t"
    /* The output operand is named [sum]; the former %[asum] did not exist. */
    "vsteg   %%v24,%[sum],0"
    : [sum] "=Q"(sum),[n] "+&r"(n)
    /* The unnamed "m" input tells the compiler the asm reads 2 * n FLOATs. */
    : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
    : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
       "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
 }

 /*
  * Plain sum of the real and imaginary parts of n complex elements of x,
  * where consecutive elements are 2 * inc_x FLOATs apart.
  *
  * Returns 0.0 when n <= 0 or inc_x <= 0. For unit stride the largest
  * multiple-of-16 prefix is handed to zsum_kernel_16(); whatever remains
  * (or the whole vector for other strides) is added element by element.
  */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  BLASLONG count = 0;  /* complex elements consumed so far */
  BLASLONG off = 0;    /* FLOAT offset of the current element's real part */
  BLASLONG step;       /* FLOAT distance between consecutive elements */
  BLASLONG bulk;       /* element count given to the vector kernel */
  FLOAT total = 0.0;

  if (n <= 0 || inc_x <= 0)
    return (total);

  if (inc_x != 1) {
    step = 2 * inc_x;
  } else {
    step = 2;
    bulk = n & -16;
    if (bulk > 0) {
      total = zsum_kernel_16(bulk, x);
      count = bulk;
      off = 2 * bulk;
    }
  }

  /* Scalar path: each element contributes real + imaginary. */
  for (; count < n; count++, off += step)
    total += x[off] + x[off + 1];

  return (total);
 }