| @@ -121,5 +121,11 @@ In chronological order: | |||
| * [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1). | |||
| ARMv8 support. | |||
| * Dan Kortschak | |||
| * [2015-01-07] Added test for drotmg bug #484. | |||
| * Ton van den Heuvel <https://github.com/ton> | |||
| * [2015-03-18] Fix race condition during shutdown causing a crash in gotoblas_set_affinity(). | |||
| * [Your name or handle] <[email or website]> | |||
| * [Date] [Brief summary of your changes] | |||
| @@ -1,4 +1,24 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.2.14 | |||
| 24-Mar-2015 | |||
| common: | |||
| * Improve OpenBLASConfig.cmake. (#474, #475. Thanks, xantares.) | |||
| * Improve ger and gemv for small matrices by stack allocation. | |||
| e.g. make MAX_STACK_ALLOC=2048 (#482. Thanks, Jerome Robert.) | |||
| * Introduce openblas_get_num_threads and openblas_get_num_procs. | |||
| (#497. Thanks, Erik Schnetter.) | |||
| * Add ATLAS-style ?geadd function. (#509. Thanks, Martin Köhler.) | |||
| * Fix c/zsyr bug with negative incx. (#492.) | |||
| * Fix race condition during shutdown causing a crash in | |||
| gotoblas_set_affinity(). (#508. Thanks, Ton van den Heuvel.) | |||
| x86/x86-64: | |||
| * Support AMD Steamroller. | |||
| ARM: | |||
| * Add Cortex-A9 and Cortex-A15 targets. | |||
| ==================================================================== | |||
| Version 0.2.13 | |||
| 3-Dec-2014 | |||
| @@ -9,10 +9,10 @@ | |||
| If you want to allocate 64 large pages, | |||
| $shell> echo 0 > /pros/sys/vm/nr_hugepages # need to be reset | |||
| $shell> echo 65 > /pros/sys/vm/nr_hugepages # add 1 extra page | |||
| $shell> echo 3355443200 > /pros/sys/kernel/shmmax # just large number | |||
| $shell> echo 3355443200 > /pros/sys/kernel/shmall | |||
| $shell> echo 0 > /proc/sys/vm/nr_hugepages # need to be reset | |||
| $shell> echo 65 > /proc/sys/vm/nr_hugepages # add 1 extra page | |||
| $shell> echo 3355443200 > /proc/sys/kernel/shmmax # just large number | |||
| $shell> echo 3355443200 > /proc/sys/kernel/shmall | |||
| You may also need to add a few lines to the /etc/security/limits.conf file. | |||
| @@ -1,3 +1,8 @@ | |||
| # ifeq logical or | |||
| ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15)) | |||
| CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a | |||
| FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a | |||
| endif | |||
| ifeq ($(CORE), ARMV7) | |||
| CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a | |||
| @@ -9,7 +9,7 @@ OPENBLAS_INCLUDE_DIR := $(PREFIX)/include | |||
| OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib | |||
| OPENBLAS_BINARY_DIR := $(PREFIX)/bin | |||
| OPENBLAS_BUILD_DIR := $(CURDIR) | |||
| OPENBLAS_CMAKE_DIR := $(PREFIX)/cmake | |||
| OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas | |||
| OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake | |||
| .PHONY : install | |||
| @@ -46,11 +46,11 @@ ifndef NO_CBLAS | |||
| endif | |||
| ifndef NO_LAPACKE | |||
| @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h | |||
| @-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h | |||
| @-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h | |||
| @-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h | |||
| @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h | |||
| endif | |||
| #for install static library | |||
| @@ -95,7 +95,8 @@ endif | |||
| endif | |||
| #Generating OpenBLASConfig.cmake | |||
| @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | |||
| @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||
| @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||
| @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) | |||
| ifndef NO_SHARED | |||
| #ifeq logical or | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.2.13 | |||
| VERSION = 0.2.14 | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -159,6 +159,19 @@ COMMON_PROF = -pg | |||
| # Build Debug version | |||
| # DEBUG = 1 | |||
| # Improve GEMV and GER for small matrices by stack allocation. | |||
| # For details, https://github.com/xianyi/OpenBLAS/pull/482 | |||
| # | |||
| # MAX_STACK_ALLOC=2048 | |||
| # Add a prefix or suffix to all exported symbol names in the shared library. | |||
| # Avoid conflicts with other BLAS libraries, especially when using | |||
| # 64 bit integer interfaces in OpenBLAS. | |||
| # For details, https://github.com/xianyi/OpenBLAS/pull/459 | |||
| # | |||
| # SYMBOLPREFIX= | |||
| # SYMBOLSUFFIX= | |||
| # | |||
| # End of user configuration | |||
| # | |||
| @@ -61,6 +61,9 @@ endif | |||
| ifeq ($(TARGET), PILEDRIVER) | |||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||
| endif | |||
| ifeq ($(TARGET), STEAMROLLER) | |||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||
| endif | |||
| endif | |||
| @@ -85,6 +88,9 @@ endif | |||
| ifeq ($(TARGET_CORE), PILEDRIVER) | |||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||
| endif | |||
| ifeq ($(TARGET_CORE), STEAMROLLER) | |||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||
| endif | |||
| endif | |||
| @@ -305,6 +311,10 @@ ifdef SANITY_CHECK | |||
| CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU) | |||
| endif | |||
| ifdef MAX_STACK_ALLOC | |||
| CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) | |||
| endif | |||
| # | |||
| # Architecture dependent settings | |||
| # | |||
| @@ -354,6 +364,12 @@ endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| #check | |||
| ifeq ($(USE_THREAD), 0) | |||
| $(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.) | |||
| endif | |||
| # ifeq logical or. GCC or LSB | |||
| ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) | |||
| CCOMMON_OPT += -fopenmp | |||
| @@ -392,7 +408,7 @@ endif | |||
| ifeq ($(ARCH), x86_64) | |||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
| ifneq ($(NO_AVX), 1) | |||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER | |||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER | |||
| endif | |||
| ifneq ($(NO_AVX2), 1) | |||
| DYNAMIC_CORE += HASWELL | |||
| @@ -60,6 +60,7 @@ Please read GotoBLAS_01Readme.txt | |||
| - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | |||
| - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks, Werner Saar.) | |||
| - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. | |||
| - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. | |||
| #### MIPS64: | |||
| - **ICT Loongson 3A**: Optimized Level-3 BLAS and parts of Level-1 and Level-2. | |||
| @@ -32,6 +32,7 @@ ISTANBUL | |||
| BOBCAT | |||
| BULLDOZER | |||
| PILEDRIVER | |||
| STEAMROLLER | |||
| c)VIA CPU: | |||
| SSE_GENERIC | |||
| @@ -62,6 +63,11 @@ SPARC | |||
| SPARCV7 | |||
| 6.ARM CPU: | |||
| CORTEXA15 | |||
| CORTEXA9 | |||
| ARMV7 | |||
| ARMV6 | |||
| ARMV5 | |||
| 7.ARM 64-bit CPU: | |||
| ARMV8 | |||
| @@ -6,8 +6,13 @@ include $(TOPDIR)/Makefile.system | |||
| #LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm | |||
| # ACML custom | |||
| ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib | |||
| LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm | |||
| #ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib | |||
| #LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm | |||
| # ACML 6.1 custom | |||
| ACML=/home/werner/project/acml6.1/gfortran64_mp/lib | |||
| LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm | |||
| # Atlas Ubuntu | |||
| #ATLAS=/usr/lib/atlas-base | |||
| @@ -114,7 +114,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x, *y; | |||
| FLOAT alpha[2] = { 2.0, 2.0 }; | |||
| @@ -198,4 +198,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -117,7 +117,7 @@ static __inline double getmflops(int ratio, int m, double secs){ | |||
| } | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| #ifndef COMPLEX | |||
| char *trans[] = {"T", "N"}; | |||
| @@ -273,4 +273,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x, *y; | |||
| FLOAT result; | |||
| @@ -192,4 +192,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -139,7 +139,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork; | |||
| FLOAT wkopt[4]; | |||
| @@ -257,4 +257,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -118,14 +118,15 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *b, *c; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {1.0, 1.0}; | |||
| char trans='N'; | |||
| blasint m, i, j; | |||
| blasint m, n, i, j; | |||
| int loops = 1; | |||
| int has_param_n=0; | |||
| int l; | |||
| char *p; | |||
| @@ -162,6 +163,11 @@ int MAIN__(int argc, char *argv[]){ | |||
| if ( p != NULL ) | |||
| loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_PARAM_N"))) { | |||
| n = atoi(p); | |||
| has_param_n=1; | |||
| } | |||
| #ifdef linux | |||
| srandom(getpid()); | |||
| @@ -174,7 +180,14 @@ int MAIN__(int argc, char *argv[]){ | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| if ( has_param_n == 1 && n <= m ) | |||
| n=n; | |||
| else | |||
| n=m; | |||
| fprintf(stderr, " %6dx%d : ", (int)m, (int)n); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| @@ -189,7 +202,7 @@ int MAIN__(int argc, char *argv[]){ | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); | |||
| GEMM (&trans, &trans, &m, &n, &m, alpha, a, &m, b, &m, beta, c, &m ); | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| @@ -202,11 +215,11 @@ int MAIN__(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / timeg * 1.e-6); | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *b, *c; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| @@ -209,4 +209,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| @@ -266,4 +266,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| @@ -214,5 +214,5 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a,*work; | |||
| FLOAT wkopt[4]; | |||
| @@ -231,4 +231,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -107,7 +107,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *b, *c; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| @@ -189,4 +189,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| @@ -205,4 +205,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -106,7 +106,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *b, *c; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| @@ -188,4 +188,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *c; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| @@ -186,4 +186,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -137,7 +137,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *b; | |||
| blasint *ipiv; | |||
| @@ -270,4 +270,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -114,7 +114,7 @@ int gettimeofday(struct timeval *tv, void *tz){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| #ifndef COMPLEX | |||
| char *trans[] = {"T", "N"}; | |||
| @@ -278,5 +278,5 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *b, *c; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| @@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| @@ -215,4 +215,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *b, *c; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| @@ -200,4 +200,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *c; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| @@ -196,4 +196,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *b; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| @@ -199,4 +199,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -118,7 +118,7 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| int MAIN__(int argc, char *argv[]){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *b; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| @@ -199,4 +199,4 @@ int MAIN__(int argc, char *argv[]){ | |||
| return 0; | |||
| } | |||
| void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -81,6 +81,10 @@ if (($architecture eq "mips32") || ($architecture eq "mips64")) { | |||
| $defined = 1; | |||
| } | |||
| if (($architecture eq "arm") || ($architecture eq "arm64")) { | |||
| $defined = 1; | |||
| } | |||
| if ($architecture eq "alpha") { | |||
| $defined = 1; | |||
| $binary = 64; | |||
| @@ -13,6 +13,12 @@ extern "C" { | |||
| void openblas_set_num_threads(int num_threads); | |||
| void goto_set_num_threads(int num_threads); | |||
| /* Get the number of threads at runtime. */ | |||
| int openblas_get_num_threads(void); | |||
| /*Get the number of physical processors (cores).*/ | |||
| int openblas_get_num_procs(void); | |||
| /* Get the build configuration at runtime. */ | |||
| char* openblas_get_config(void); | |||
| @@ -341,6 +347,16 @@ void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum | |||
| void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a, | |||
| OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); | |||
| void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, | |||
| float *c, OPENBLAS_CONST blasint cldc); | |||
| void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, | |||
| double *c, OPENBLAS_CONST blasint cldc); | |||
| void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, | |||
| float *c, OPENBLAS_CONST blasint cldc); | |||
| void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, | |||
| double *c, OPENBLAS_CONST blasint cldc); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif /* __cplusplus */ | |||
| @@ -13,6 +13,12 @@ extern "C" { | |||
| void openblas_set_num_threads(int num_threads); | |||
| void goto_set_num_threads(int num_threads); | |||
| /* Get the number of threads at runtime. */ | |||
| int openblas_get_num_threads(void); | |||
| /*Get the number of physical processors (cores).*/ | |||
| int openblas_get_num_procs(void); | |||
| /* Get the build configuration at runtime. */ | |||
| char* openblas_get_config(void); | |||
| @@ -327,6 +333,16 @@ void cblas_cimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, bl | |||
| blasint clda, blasint cldb); | |||
| void cblas_zimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double* calpha, double* a, | |||
| blasint clda, blasint cldb); | |||
| void cblas_sgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float calpha, float *a, blasint clda, float cbeta, | |||
| float *c, blasint cldc); | |||
| void cblas_dgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double calpha, double *a, blasint clda, double cbeta, | |||
| double *c, blasint cldc); | |||
| void cblas_cgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float *calpha, float *a, blasint clda, float *cbeta, | |||
| float *c, blasint cldc); | |||
| void cblas_zgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double *calpha, double *a, blasint clda, double *cbeta, | |||
| double *c, blasint cldc); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif /* __cplusplus */ | |||
| @@ -327,6 +327,14 @@ typedef int blasint; | |||
| #endif | |||
| #endif | |||
| /* | |||
| #ifdef STEAMROLLER | |||
| #ifndef YIELDING | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
| #endif | |||
| #endif | |||
| */ | |||
| #ifndef YIELDING | |||
| #define YIELDING sched_yield() | |||
| #endif | |||
| @@ -220,6 +220,7 @@ | |||
| #define COMATCOPY_K_CTC comatcopy_k_ctc | |||
| #define COMATCOPY_K_RTC comatcopy_k_rtc | |||
| #define CGEADD_K cgeadd_k | |||
| #else | |||
| @@ -402,6 +403,7 @@ | |||
| #define COMATCOPY_K_RNC gotoblas -> comatcopy_k_rnc | |||
| #define COMATCOPY_K_CTC gotoblas -> comatcopy_k_ctc | |||
| #define COMATCOPY_K_RTC gotoblas -> comatcopy_k_rtc | |||
| #define CGEADD_K gotoblas -> cgeadd_k | |||
| #endif | |||
| @@ -149,6 +149,7 @@ | |||
| #define DOMATCOPY_K_RN domatcopy_k_rn | |||
| #define DOMATCOPY_K_CT domatcopy_k_ct | |||
| #define DOMATCOPY_K_RT domatcopy_k_rt | |||
| #define DGEADD_K dgeadd_k | |||
| #else | |||
| @@ -267,6 +268,8 @@ | |||
| #define DOMATCOPY_K_CT gotoblas -> domatcopy_k_ct | |||
| #define DOMATCOPY_K_RT gotoblas -> domatcopy_k_rt | |||
| #define DGEADD_K gotoblas -> dgeadd_k | |||
| #endif | |||
| #define DGEMM_NN dgemm_nn | |||
| @@ -754,6 +754,12 @@ void BLASFUNC(dimatcopy) (char *, char *, blasint *, blasint *, double *, do | |||
| void BLASFUNC(cimatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, blasint *); | |||
| void BLASFUNC(zimatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, blasint *); | |||
| void BLASFUNC(sgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); | |||
| void BLASFUNC(dgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); | |||
| void BLASFUNC(cgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); | |||
| void BLASFUNC(zgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); | |||
| #ifdef __cplusplus | |||
| } | |||
| @@ -1762,6 +1762,11 @@ int zomatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, dou | |||
| int zomatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); | |||
| int zomatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); | |||
| int sgeadd_k(BLASLONG, BLASLONG, float, float*, BLASLONG, float, float *, BLASLONG); | |||
| int dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BLASLONG); | |||
| int cgeadd_k(BLASLONG, BLASLONG, float, float, float*, BLASLONG, float, float, float *, BLASLONG); | |||
| int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG); | |||
| #ifdef __CUDACC__ | |||
| } | |||
| @@ -634,7 +634,7 @@ | |||
| #define OMATCOPY_K_RN DOMATCOPY_K_RN | |||
| #define OMATCOPY_K_CT DOMATCOPY_K_CT | |||
| #define OMATCOPY_K_RT DOMATCOPY_K_RT | |||
| #define GEADD_K DGEADD_K | |||
| #else | |||
| #define AMAX_K SAMAX_K | |||
| @@ -932,6 +932,7 @@ | |||
| #define OMATCOPY_K_CT SOMATCOPY_K_CT | |||
| #define OMATCOPY_K_RT SOMATCOPY_K_RT | |||
| #define GEADD_K SGEADD_K | |||
| #endif | |||
| #else | |||
| #ifdef XDOUBLE | |||
| @@ -1746,6 +1747,7 @@ | |||
| #define OMATCOPY_K_RNC ZOMATCOPY_K_RNC | |||
| #define OMATCOPY_K_CTC ZOMATCOPY_K_CTC | |||
| #define OMATCOPY_K_RTC ZOMATCOPY_K_RTC | |||
| #define GEADD_K ZGEADD_K | |||
| #else | |||
| @@ -2159,6 +2161,8 @@ | |||
| #define OMATCOPY_K_CTC COMATCOPY_K_CTC | |||
| #define OMATCOPY_K_RTC COMATCOPY_K_RTC | |||
| #define GEADD_K CGEADD_K | |||
| #endif | |||
| #endif | |||
| @@ -855,6 +855,10 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||
| int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); | |||
| int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); | |||
| int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); | |||
| int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); | |||
| int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); | |||
| int (*zgeadd_k) (BLASLONG, BLASLONG, float, double, double *, BLASLONG, double, double, double *, BLASLONG); | |||
| } gotoblas_t; | |||
| @@ -153,6 +153,7 @@ | |||
| #define SOMATCOPY_K_CT somatcopy_k_ct | |||
| #define SOMATCOPY_K_RT somatcopy_k_rt | |||
| #define SGEADD_K sgeadd_k | |||
| #else | |||
| @@ -274,6 +275,7 @@ | |||
| #define SOMATCOPY_K_CT gotoblas -> somatcopy_k_ct | |||
| #define SOMATCOPY_K_RT gotoblas -> somatcopy_k_rt | |||
| #define SGEADD_K gotoblas -> sgeadd_k | |||
| #endif | |||
| @@ -171,7 +171,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| #define MMXSTORE movd | |||
| #endif | |||
| #if defined(PILEDRIVER) || defined(BULLDOZER) | |||
| #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) | |||
| //Enable some optimization for Barcelona. | |||
| #define BARCELONA_OPTIMIZATION | |||
| #endif | |||
| @@ -226,7 +226,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| #ifdef ASSEMBLER | |||
| #if defined(PILEDRIVER) || defined(BULLDOZER) | |||
| #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) | |||
| //Enable some optimization for Barcelona. | |||
| #define BARCELONA_OPTIMIZATION | |||
| #endif | |||
| @@ -220,6 +220,7 @@ | |||
| #define ZOMATCOPY_K_CTC zomatcopy_k_ctc | |||
| #define ZOMATCOPY_K_RTC zomatcopy_k_rtc | |||
| #define ZGEADD_K zgeadd_k | |||
| #else | |||
| @@ -403,6 +404,8 @@ | |||
| #define ZOMATCOPY_K_CTC gotoblas -> zomatcopy_k_ctc | |||
| #define ZOMATCOPY_K_RTC gotoblas -> zomatcopy_k_rtc | |||
| #define ZGEADD_K gotoblas -> zgeadd_k | |||
| #endif | |||
| #define ZGEMM_NN zgemm_nn | |||
| @@ -104,10 +104,11 @@ | |||
| #define CORE_ATOM 18 | |||
| #define CORE_NANO 19 | |||
| #define CORE_SANDYBRIDGE 20 | |||
| #define CORE_BOBCAT 21 | |||
| #define CORE_BULLDOZER 22 | |||
| #define CORE_BOBCAT 21 | |||
| #define CORE_BULLDOZER 22 | |||
| #define CORE_PILEDRIVER 23 | |||
| #define CORE_HASWELL 24 | |||
| #define CORE_HASWELL 24 | |||
| #define CORE_STEAMROLLER 25 | |||
| #define HAVE_SSE (1 << 0) | |||
| #define HAVE_SSE2 (1 << 1) | |||
| @@ -200,6 +201,7 @@ typedef struct { | |||
| #define CPUTYPE_BOBCAT 45 | |||
| #define CPUTYPE_BULLDOZER 46 | |||
| #define CPUTYPE_PILEDRIVER 47 | |||
| #define CPUTYPE_HASWELL 48 | |||
| #define CPUTYPE_HASWELL 48 | |||
| #define CPUTYPE_STEAMROLLER 49 | |||
| #endif | |||
| @@ -30,16 +30,27 @@ | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_ARMV6 1 | |||
| #define CPU_ARMV7 2 | |||
| #define CPU_CORTEXA15 3 | |||
| #define CPU_CORTEXA9 3 | |||
| #define CPU_CORTEXA15 4 | |||
/*
 * Upper-case names of the ARM cores this file can report, indexed by the
 * CPU_* constants (CPU_UNKNOWN = 0, CPU_ARMV6 = 1, CPU_ARMV7 = 2,
 * CPU_CORTEXA9 = 3, CPU_CORTEXA15 = 4). get_subarchitecture() prints
 * cpuname[detect()] verbatim, so entry 0 must be spelled "UNKNOWN" to keep
 * the output that the previous switch-based implementation produced for an
 * unrecognized CPU.
 */
static char *cpuname[] = {
  "UNKNOWN",
  "ARMV6",
  "ARMV7",
  "CORTEXA9",
  "CORTEXA15"
};

/* Lower-case spellings of the same names, in the same CPU_* index order. */
static char *cpuname_lower[] = {
  "unknown",
  "armv6",
  "armv7",
  "cortexa9",
  "cortexa15"
};
| int get_feature(char *search) | |||
| { | |||
| @@ -85,6 +96,29 @@ int detect(void) | |||
| char buffer[512], *p; | |||
| p = (char *) NULL ; | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)) | |||
| { | |||
| if (!strncmp("CPU part", buffer, 8)) | |||
| { | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if(p != NULL) { | |||
| if (strstr(p, "0xc09")) { | |||
| return CPU_CORTEXA9; | |||
| } | |||
| if (strstr(p, "0xc0f")) { | |||
| return CPU_CORTEXA15; | |||
| } | |||
| } | |||
| p = (char *) NULL ; | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)) | |||
| @@ -142,21 +176,7 @@ void get_architecture(void) | |||
| void get_subarchitecture(void) | |||
| { | |||
| int d = detect(); | |||
| switch (d) | |||
| { | |||
| case CPU_ARMV7: | |||
| printf("ARMV7"); | |||
| break; | |||
| case CPU_ARMV6: | |||
| printf("ARMV6"); | |||
| break; | |||
| default: | |||
| printf("UNKNOWN"); | |||
| break; | |||
| } | |||
| printf("%s", cpuname[d]); | |||
| } | |||
| void get_subdirname(void) | |||
| @@ -170,6 +190,36 @@ void get_cpuconfig(void) | |||
| int d = detect(); | |||
| switch (d) | |||
| { | |||
| case CPU_CORTEXA9: | |||
| printf("#define CORTEXA9\n"); | |||
| printf("#define HAVE_VFP\n"); | |||
| printf("#define HAVE_VFPV3\n"); | |||
| if ( get_feature("neon")) printf("#define HAVE_NEON\n"); | |||
| if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 32\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 128\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| break; | |||
| case CPU_CORTEXA15: | |||
| printf("#define CORTEXA15\n"); | |||
| printf("#define HAVE_VFP\n"); | |||
| printf("#define HAVE_VFPV3\n"); | |||
| if ( get_feature("neon")) printf("#define HAVE_NEON\n"); | |||
| if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 32\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 128\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| break; | |||
| case CPU_ARMV7: | |||
| printf("#define ARMV7\n"); | |||
| @@ -206,18 +256,7 @@ void get_libname(void) | |||
| { | |||
| int d = detect(); | |||
| switch (d) | |||
| { | |||
| case CPU_ARMV7: | |||
| printf("armv7\n"); | |||
| break; | |||
| case CPU_ARMV6: | |||
| printf("armv6\n"); | |||
| break; | |||
| } | |||
| printf("%s", cpuname_lower[d]); | |||
| } | |||
| @@ -1162,6 +1162,12 @@ int get_cpuname(void){ | |||
| return CPUTYPE_PILEDRIVER; | |||
| else | |||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | |||
| case 0: | |||
| if(support_avx()) | |||
| return CPUTYPE_STEAMROLLER; | |||
| else | |||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | |||
| } | |||
| break; | |||
| case 5: | |||
| @@ -1290,6 +1296,7 @@ static char *cpuname[] = { | |||
| "BULLDOZER", | |||
| "PILEDRIVER", | |||
| "HASWELL", | |||
| "STEAMROLLER", | |||
| }; | |||
| static char *lowercpuname[] = { | |||
| @@ -1341,6 +1348,7 @@ static char *lowercpuname[] = { | |||
| "bulldozer", | |||
| "piledriver", | |||
| "haswell", | |||
| "steamroller", | |||
| }; | |||
| static char *corename[] = { | |||
| @@ -1369,6 +1377,7 @@ static char *corename[] = { | |||
| "BULLDOZER", | |||
| "PILEDRIVER", | |||
| "HASWELL", | |||
| "STEAMROLLER", | |||
| }; | |||
| static char *corename_lower[] = { | |||
| @@ -1397,6 +1406,7 @@ static char *corename_lower[] = { | |||
| "bulldozer", | |||
| "piledriver", | |||
| "haswell", | |||
| "steamroller", | |||
| }; | |||
| @@ -1562,7 +1572,15 @@ int get_coretype(void){ | |||
| return CORE_PILEDRIVER; | |||
| else | |||
| return CORE_BARCELONA; //OS don't support AVX. | |||
| case 0: | |||
| if(support_avx()) | |||
| return CORE_STEAMROLLER; | |||
| else | |||
| return CORE_BARCELONA; //OS don't support AVX. | |||
| } | |||
| }else return CORE_BARCELONA; | |||
| } | |||
| } | |||
| @@ -1,7 +1,7 @@ | |||
| TOPDIR = ../.. | |||
| include ../../Makefile.system | |||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) | |||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) | |||
| #COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) | |||
| @@ -103,6 +103,12 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../. | |||
| openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| openblas_get_num_threads.$(SUFFIX) : openblas_get_num_threads.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| openblas_get_num_procs.$(SUFFIX) : openblas_get_num_procs.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| openblas_get_config.$(SUFFIX) : openblas_get_config.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| @@ -66,6 +66,7 @@ extern gotoblas_t gotoblas_BOBCAT; | |||
| extern gotoblas_t gotoblas_SANDYBRIDGE; | |||
| extern gotoblas_t gotoblas_BULLDOZER; | |||
| extern gotoblas_t gotoblas_PILEDRIVER; | |||
| extern gotoblas_t gotoblas_STEAMROLLER; | |||
| #ifdef NO_AVX2 | |||
| #define gotoblas_HASWELL gotoblas_SANDYBRIDGE | |||
| #else | |||
| @@ -77,6 +78,7 @@ extern gotoblas_t gotoblas_HASWELL; | |||
| #define gotoblas_HASWELL gotoblas_NEHALEM | |||
| #define gotoblas_BULLDOZER gotoblas_BARCELONA | |||
| #define gotoblas_PILEDRIVER gotoblas_BARCELONA | |||
| #define gotoblas_STEAMROLLER gotoblas_BARCELONA | |||
| #endif | |||
| @@ -275,7 +277,17 @@ static gotoblas_t *get_coretype(void){ | |||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| }else if(model == 0){ | |||
| //AMD STEAMROLLER | |||
| if(support_avx()) | |||
| return &gotoblas_STEAMROLLER; | |||
| else{ | |||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| } else { | |||
| return &gotoblas_BARCELONA; | |||
| } | |||
| @@ -315,6 +327,7 @@ static char *corename[] = { | |||
| "Bulldozer", | |||
| "Piledriver", | |||
| "Haswell", | |||
| "Steamroller", | |||
| }; | |||
| char *gotoblas_corename(void) { | |||
| @@ -339,6 +352,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | |||
| if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; | |||
| if (gotoblas == &gotoblas_HASWELL) return corename[20]; | |||
| if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; | |||
| return corename[0]; | |||
| } | |||
| @@ -349,9 +363,9 @@ static gotoblas_t *force_coretype(char *coretype){ | |||
| int i ; | |||
| int found = -1; | |||
| char message[128]; | |||
| char mname[20]; | |||
| //char mname[20]; | |||
| for ( i=1 ; i <= 20; i++) | |||
| for ( i=1 ; i <= 21; i++) | |||
| { | |||
| if (!strncasecmp(coretype,corename[i],20)) | |||
| { | |||
| @@ -361,8 +375,8 @@ static gotoblas_t *force_coretype(char *coretype){ | |||
| } | |||
| if (found < 0) | |||
| { | |||
| strncpy(mname,coretype,20); | |||
| sprintf(message, "Core not found: %s\n",mname); | |||
| //strncpy(mname,coretype,20); | |||
| snprintf(message, 128, "Core not found: %s\n",coretype); | |||
| openblas_warning(1, message); | |||
| return(NULL); | |||
| } | |||
| @@ -370,6 +384,7 @@ static gotoblas_t *force_coretype(char *coretype){ | |||
| switch (found) | |||
| { | |||
| case 21: return (&gotoblas_STEAMROLLER); | |||
| case 20: return (&gotoblas_HASWELL); | |||
| case 19: return (&gotoblas_PILEDRIVER); | |||
| case 18: return (&gotoblas_BULLDOZER); | |||
| @@ -241,6 +241,7 @@ void set_stack_limit(int limitMB){ | |||
| */ | |||
| #endif | |||
| /* | |||
| OpenBLAS uses the numbers of CPU cores in multithreading. | |||
| It can be set by openblas_set_num_threads(int num_threads); | |||
| @@ -323,6 +324,23 @@ int blas_get_cpu_number(void){ | |||
| } | |||
| #endif | |||
| /* Processor count: get_num_procs() in SMP builds, always 1 otherwise. */ | |||
| int openblas_get_num_procs(void) { | |||
| #ifdef SMP | |||
|   return get_num_procs(); | |||
| #else | |||
|   return 1; | |||
| #endif | |||
| } | |||
| /* Thread count: blas_get_cpu_number() in SMP builds, always 1 otherwise. */ | |||
| int openblas_get_num_threads(void) { | |||
| #ifdef SMP | |||
|   return blas_get_cpu_number(); | |||
| #else | |||
|   return 1; | |||
| #endif | |||
| } | |||
| struct release_t { | |||
| void *address; | |||
| void (*func)(struct release_t *); | |||
| @@ -1335,6 +1353,8 @@ void DESTRUCTOR gotoblas_quit(void) { | |||
| if (gotoblas_initialized == 0) return; | |||
| blas_shutdown(); | |||
| #ifdef PROFILE | |||
| moncontrol (0); | |||
| #endif | |||
| @@ -1356,8 +1376,6 @@ void DESTRUCTOR gotoblas_quit(void) { | |||
| #ifdef PROFILE | |||
| moncontrol (1); | |||
| #endif | |||
| blas_shutdown(); | |||
| } | |||
| #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) | |||
| @@ -0,0 +1,40 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011-2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include "common.h" | |||
| /* Declared here, defined outside this file. */ | |||
| extern int openblas_get_num_procs(void); | |||
| /* Underscore-suffixed entry point (presumably for Fortran callers); | |||
| forwards to openblas_get_num_procs() and returns its result unchanged. */ | |||
| int openblas_get_num_procs_(void) { | |||
|   int nprocs = openblas_get_num_procs(); | |||
|   return nprocs; | |||
| } | |||
| @@ -0,0 +1,40 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011-2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include "common.h" | |||
| /* Declared here, defined outside this file. */ | |||
| extern int openblas_get_num_threads(void); | |||
| /* Underscore-suffixed entry point (presumably for Fortran callers); | |||
| forwards to openblas_get_num_threads() and returns its result unchanged. */ | |||
| int openblas_get_num_threads_(void) { | |||
|   int nthreads = openblas_get_num_threads(); | |||
|   return nthreads; | |||
| } | |||
| @@ -166,7 +166,7 @@ int get_L2_size(void){ | |||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ | |||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | |||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ | |||
| defined(PILEDRIVER) || defined(HASWELL) | |||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) | |||
| cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | |||
| @@ -251,7 +251,7 @@ void blas_set_parameter(void){ | |||
| env_var_t p; | |||
| int factor; | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) | |||
| int size = 16; | |||
| #else | |||
| int size = get_L2_size(); | |||
| @@ -100,7 +100,12 @@ else | |||
| $(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed | |||
| $(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def | |||
| endif | |||
| ifeq ($(NOFORTRAN), 2) | |||
| #only build cblas without Fortran | |||
| $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| else | |||
| $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| endif | |||
| dllinit.$(SUFFIX) : dllinit.c | |||
| $(CC) $(CFLAGS) -c -o $(@F) -s $< | |||
| @@ -23,7 +23,8 @@ | |||
| zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, | |||
| ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zsymv, | |||
| xerbla, | |||
| saxpby,daxpby,caxpby,zaxpby | |||
| saxpby,daxpby,caxpby,zaxpby, | |||
| sgeadd,dgeadd,cgeadd,zgeadd, | |||
| ); | |||
| @cblasobjs = ( | |||
| @@ -55,6 +56,7 @@ | |||
| cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby, | |||
| cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy, | |||
| cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy, | |||
| cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd | |||
| ); | |||
| @exblasobjs = ( | |||
| @@ -81,7 +83,10 @@ | |||
| #both underscore and no underscore | |||
| @misc_common_objs = ( | |||
| openblas_set_num_threads, openblas_get_parallel, | |||
| openblas_get_parallel, | |||
| openblas_get_num_procs, | |||
| openblas_set_num_threads, | |||
| openblas_get_num_threads, | |||
| ); | |||
| @misc_no_underscore_objs = ( | |||
| @@ -432,6 +432,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "PILEDRIVER" | |||
| #endif | |||
| /* Hard-coded target description used when FORCE_STEAMROLLER is defined: | |||
| architecture/sub-architecture strings, the ARCHCONFIG flag string (cache | |||
| geometry, DTB settings and HAVE_* feature flags), library name and core name. | |||
| NOTE(review): FORCE_INTEL is defined although the ChangeLog lists this as an | |||
| AMD target -- presumably copied from the neighbouring entries; confirm. */ | |||
| #if defined (FORCE_STEAMROLLER) | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "STEAMROLLER" | |||
| #define ARCHCONFIG "-DSTEAMROLLER " \ | |||
| "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ | |||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ | |||
| "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" | |||
| #define LIBNAME "steamroller" | |||
| #define CORENAME "STEAMROLLER" | |||
| #endif | |||
| #ifdef FORCE_SSE_GENERIC | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| @@ -710,6 +727,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #endif | |||
| /* Hard-coded target description used when FORCE_CORTEXA9 is defined. | |||
| The cache/DTB values match what get_cpuconfig() prints for CPU_CORTEXA9. | |||
| NOTE(review): HAVE_NEON is set unconditionally here, whereas get_cpuconfig() | |||
| only emits HAVE_NEON when get_feature("neon") succeeds -- confirm this is | |||
| intended. The empty #else branch is a no-op. */ | |||
| #ifdef FORCE_CORTEXA9 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM" | |||
| #define SUBARCHITECTURE "CORTEXA9" | |||
| #define SUBDIRNAME "arm" | |||
| #define ARCHCONFIG "-DCORTEXA9 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ | |||
| "-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" | |||
| #define LIBNAME "cortexa9" | |||
| #define CORENAME "CORTEXA9" | |||
| #else | |||
| #endif | |||
| /* Hard-coded target description used when FORCE_CORTEXA15 is defined. | |||
| The cache/DTB values match what get_cpuconfig() prints for CPU_CORTEXA15. | |||
| NOTE(review): HAVE_NEON is set unconditionally here, whereas get_cpuconfig() | |||
| only emits HAVE_NEON when get_feature("neon") succeeds -- confirm this is | |||
| intended. The empty #else branch is a no-op. */ | |||
| #ifdef FORCE_CORTEXA15 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM" | |||
| #define SUBARCHITECTURE "CORTEXA15" | |||
| #define SUBDIRNAME "arm" | |||
| #define ARCHCONFIG "-DCORTEXA15 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ | |||
| "-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" | |||
| #define LIBNAME "cortexa15" | |||
| #define CORENAME "CORTEXA15" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_ARMV6 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM" | |||
| @@ -43,7 +43,8 @@ SBLAS2OBJS = \ | |||
| SBLAS3OBJS = \ | |||
| sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ | |||
| strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \ | |||
| somatcopy.$(SUFFIX) simatcopy.$(SUFFIX) | |||
| somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ | |||
| sgeadd.$(SUFFIX) | |||
| DBLAS1OBJS = \ | |||
| @@ -68,7 +69,8 @@ DBLAS2OBJS = \ | |||
| DBLAS3OBJS = \ | |||
| dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ | |||
| dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \ | |||
| domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX) | |||
| domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\ | |||
| dgeadd.$(SUFFIX) | |||
| CBLAS1OBJS = \ | |||
| caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ | |||
| @@ -96,7 +98,8 @@ CBLAS3OBJS = \ | |||
| cgemm.$(SUFFIX) csymm.$(SUFFIX) ctrmm.$(SUFFIX) \ | |||
| ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ | |||
| chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \ | |||
| comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX) | |||
| comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\ | |||
| cgeadd.$(SUFFIX) | |||
| ZBLAS1OBJS = \ | |||
| zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ | |||
| @@ -124,7 +127,8 @@ ZBLAS3OBJS = \ | |||
| zgemm.$(SUFFIX) zsymm.$(SUFFIX) ztrmm.$(SUFFIX) \ | |||
| ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \ | |||
| zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \ | |||
| zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX) | |||
| zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\ | |||
| zgeadd.$(SUFFIX) | |||
| ifeq ($(SUPPORT_GEMM3M), 1) | |||
| @@ -269,7 +273,8 @@ CSBLAS2OBJS = \ | |||
| CSBLAS3OBJS = \ | |||
| cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ | |||
| cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX) | |||
| cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ | |||
| cblas_sgeadd.$(SUFFIX) | |||
| CDBLAS1OBJS = \ | |||
| cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | |||
| @@ -285,7 +290,8 @@ CDBLAS2OBJS = \ | |||
| CDBLAS3OBJS += \ | |||
| cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ | |||
| cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) | |||
| cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ | |||
| cblas_dgeadd.$(SUFFIX) | |||
| CCBLAS1OBJS = \ | |||
| cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ | |||
| @@ -308,7 +314,9 @@ CCBLAS3OBJS = \ | |||
| cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \ | |||
| cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ | |||
| cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ | |||
| cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX) | |||
| cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ | |||
| cblas_cgeadd.$(SUFFIX) | |||
| CZBLAS1OBJS = \ | |||
| @@ -332,7 +340,9 @@ CZBLAS3OBJS = \ | |||
| cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \ | |||
| cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ | |||
| cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ | |||
| cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) | |||
| cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \ | |||
| cblas_zgeadd.$(SUFFIX) | |||
| ifeq ($(SUPPORT_GEMM3M), 1) | |||
| @@ -2103,4 +2113,27 @@ zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c | |||
| cblas_zimatcopy.$(SUFFIX) cblas_zimatcopy.$(PSUFFIX) : zimatcopy.c | |||
| $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | |||
| # ?geadd interface objects: the s/d variants are built from geadd.c and the | |||
| # c/z variants from zgeadd.c; the cblas_ variants compile the same sources | |||
| # with -DCBLAS. Precision selection is not visible in these recipes | |||
| # (presumably supplied through CFLAGS elsewhere -- confirm). | |||
| sgeadd.$(SUFFIX) sgeadd.$(PSUFFIX) : geadd.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| dgeadd.$(SUFFIX) dgeadd.$(PSUFFIX) : geadd.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| cgeadd.$(SUFFIX) cgeadd.$(PSUFFIX) : zgeadd.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| zgeadd.$(SUFFIX) zgeadd.$(PSUFFIX) : zgeadd.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| cblas_sgeadd.$(SUFFIX) cblas_sgeadd.$(PSUFFIX) : geadd.c | |||
| $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | |||
| cblas_dgeadd.$(SUFFIX) cblas_dgeadd.$(PSUFFIX) : geadd.c | |||
| $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | |||
| cblas_cgeadd.$(SUFFIX) cblas_cgeadd.$(PSUFFIX) : zgeadd.c | |||
| $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | |||
| cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c | |||
| $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | |||
| @@ -0,0 +1,148 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef FUNCTION_PROFILE | |||
| #include "functable.h" | |||
| #endif | |||
| #if defined(DOUBLE) | |||
| #define ERROR_NAME "DGEADD " | |||
| #else | |||
| #define ERROR_NAME "SGEADD " | |||
| #endif | |||
| #ifndef CBLAS | |||
| /* | |||
|  * Fortran-style entry point for real GEADD: every argument is passed by | |||
|  * reference. The dimensions are validated, then the work is handed to | |||
|  * GEADD_K(m, n, alpha, a, lda, beta, c, ldc). The kernel is not visible | |||
|  * here; presumably it computes C := alpha*A + beta*C (ATLAS-style geadd). | |||
|  * | |||
|  * On an invalid argument xerbla is called with the 1-based position of that | |||
|  * argument (M=1, N=2, LDA=5, LDC=8) and the routine returns without calling | |||
|  * the kernel. Nothing is done when m or n is zero. | |||
|  */ | |||
| void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, | |||
|           FLOAT *BETA, FLOAT *c, blasint *LDC) | |||
| { | |||
|   blasint m = *M; | |||
|   blasint n = *N; | |||
|   blasint lda = *LDA; | |||
|   blasint ldc = *LDC; | |||
|   FLOAT alpha = *ALPHA; | |||
|   FLOAT beta = *BETA; | |||
|   blasint info; | |||
|   PRINT_DEBUG_NAME; | |||
|   info = 0; | |||
|   /* Checks run in this order; the last failing check determines info. */ | |||
|   if (lda < MAX(1, m)) info = 5; /* LDA is the 5th argument (6 would be BETA) */ | |||
|   if (ldc < MAX(1, m)) info = 8; | |||
|   if (n < 0) info = 2; | |||
|   if (m < 0) info = 1; | |||
|   if (info != 0){ | |||
|     BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||
|     return; | |||
|   } | |||
| #else | |||
| /* | |||
|  * CBLAS entry point for real GEADD: scalars are passed by value and the | |||
|  * storage order is given explicitly. For CblasRowMajor, m and n are | |||
|  * exchanged before the checks and the kernel call. | |||
|  * | |||
|  * info starts at 0 and is reset to -1 (no error) once a known order has been | |||
|  * seen, so an unrecognised order is reported to xerbla with info 0; | |||
|  * otherwise info is the position of the bad argument (m=1, n=2, lda=5, ldc=8). | |||
|  */ | |||
| void CNAME( enum CBLAS_ORDER order, blasint m, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT beta, | |||
|             FLOAT *c, blasint ldc) | |||
| { | |||
|   blasint info, t; | |||
|   PRINT_DEBUG_CNAME; | |||
|   info = 0; | |||
|   if (order == CblasColMajor) { | |||
|     info = -1; | |||
|     if (ldc < MAX(1, m)) info = 8; | |||
|     if (lda < MAX(1, m)) info = 5; | |||
|     if (n < 0) info = 2; | |||
|     if (m < 0) info = 1; | |||
|   } | |||
|   if (order == CblasRowMajor) { | |||
|     info = -1; | |||
|     /* swap the two dimensions */ | |||
|     t = n; | |||
|     n = m; | |||
|     m = t; | |||
|     if (ldc < MAX(1, m)) info = 8; | |||
|     if (lda < MAX(1, m)) info = 5; | |||
|     if (n < 0) info = 2; | |||
|     if (m < 0) info = 1; | |||
|   } | |||
|   if (info >= 0) { | |||
|     BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||
|     return; | |||
|   } | |||
| #endif | |||
|   /* Shared tail of both entry points: nothing to do for an empty matrix. */ | |||
|   if ((m==0) || (n==0)) return; | |||
|   IDEBUG_START; | |||
|   FUNCTION_PROFILE_START(); | |||
|   GEADD_K(m,n,alpha, a, lda, beta, c, ldc); | |||
|   FUNCTION_PROFILE_END(1, 2* m * n , 2 * m * n); | |||
|   IDEBUG_END; | |||
|   return; | |||
| } | |||
| @@ -208,7 +208,20 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (incx < 0) x -= (lenx - 1) * incx; | |||
| if (incy < 0) y -= (leny - 1) * incy; | |||
| #ifdef MAX_STACK_ALLOC | |||
| // make it volatile because some gemv implementations (e.g. dgemv_n.S) | |||
| // do not restore all registers | |||
| volatile int stack_alloc_size = m + n; | |||
| if(stack_alloc_size < 128) | |||
| // dgemv_n.S requires a 128-byte buffer | |||
| stack_alloc_size = 128; | |||
| if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) | |||
| stack_alloc_size = 0; | |||
| FLOAT stack_buffer[stack_alloc_size]; | |||
| buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1); | |||
| #else | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| #endif | |||
| #ifdef SMP | |||
| @@ -237,7 +250,10 @@ void CNAME(enum CBLAS_ORDER order, | |||
| } | |||
| #endif | |||
| blas_memory_free(buffer); | |||
| #ifdef MAX_STACK_ALLOC | |||
| if(!stack_alloc_size) | |||
| #endif | |||
| blas_memory_free(buffer); | |||
| FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); | |||
| @@ -171,7 +171,15 @@ void CNAME(enum CBLAS_ORDER order, | |||
| if (incy < 0) y -= (n - 1) * incy; | |||
| if (incx < 0) x -= (m - 1) * incx; | |||
| #ifdef MAX_STACK_ALLOC | |||
| volatile int stack_alloc_size = m; | |||
| if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) | |||
| stack_alloc_size = 0; | |||
| FLOAT stack_buffer[stack_alloc_size]; | |||
| buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1); | |||
| #else | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| #endif | |||
| #ifdef SMPTEST | |||
| nthreads = num_cpu_avail(2); | |||
| @@ -190,7 +198,10 @@ void CNAME(enum CBLAS_ORDER order, | |||
| } | |||
| #endif | |||
| blas_memory_free(buffer); | |||
| #ifdef MAX_STACK_ALLOC | |||
| if(!stack_alloc_size) | |||
| #endif | |||
| blas_memory_free(buffer); | |||
| FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); | |||
| @@ -0,0 +1,146 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef FUNCTION_PROFILE | |||
| #include "functable.h" | |||
| #endif | |||
| #if defined(DOUBLE) | |||
| #define ERROR_NAME "ZGEADD " | |||
| #else | |||
| #define ERROR_NAME "CGEADD " | |||
| #endif | |||
| #ifndef CBLAS | |||
| /* | |||
|  * Fortran-style entry point for complex GEADD: every argument is passed by | |||
|  * reference. ALPHA and BETA each point to two consecutive FLOATs, which are | |||
|  * forwarded to the kernel as separate scalars (presumably the real and | |||
|  * imaginary parts). The dimensions are validated, then the work is handed to | |||
|  * GEADD_K; the kernel itself is not visible here. | |||
|  * | |||
|  * On an invalid argument xerbla is called with the 1-based position of that | |||
|  * argument (M=1, N=2, LDA=5, LDC=8) and the routine returns without calling | |||
|  * the kernel. Nothing is done when m or n is zero. | |||
|  */ | |||
| void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, | |||
|           FLOAT *BETA, FLOAT *c, blasint *LDC) | |||
| { | |||
|   blasint m = *M; | |||
|   blasint n = *N; | |||
|   blasint lda = *LDA; | |||
|   blasint ldc = *LDC; | |||
|   blasint info; | |||
|   PRINT_DEBUG_NAME; | |||
|   info = 0; | |||
|   /* Checks run in this order; the last failing check determines info. */ | |||
|   if (lda < MAX(1, m)) info = 5; /* LDA is the 5th argument (6 would be BETA) */ | |||
|   if (ldc < MAX(1, m)) info = 8; | |||
|   if (n < 0) info = 2; | |||
|   if (m < 0) info = 1; | |||
|   if (info != 0){ | |||
|     BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||
|     return; | |||
|   } | |||
| #else | |||
| /* | |||
|  * CBLAS entry point for complex GEADD: ALPHA and BETA are still passed as | |||
|  * pointers to two FLOATs each, and the storage order is given explicitly. | |||
|  * For CblasRowMajor, m and n are exchanged before the checks and the kernel | |||
|  * call. | |||
|  * | |||
|  * info starts at 0 and is reset to -1 (no error) once a known order has been | |||
|  * seen, so an unrecognised order is reported to xerbla with info 0; | |||
|  * otherwise info is the position of the bad argument (m=1, n=2, lda=5, ldc=8). | |||
|  */ | |||
| void CNAME( enum CBLAS_ORDER order, blasint m, blasint n, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *BETA, | |||
|             FLOAT *c, blasint ldc) | |||
| { | |||
|   blasint info, t; | |||
|   PRINT_DEBUG_CNAME; | |||
|   info = 0; | |||
|   if (order == CblasColMajor) { | |||
|     info = -1; | |||
|     if (ldc < MAX(1, m)) info = 8; | |||
|     if (lda < MAX(1, m)) info = 5; | |||
|     if (n < 0) info = 2; | |||
|     if (m < 0) info = 1; | |||
|   } | |||
|   if (order == CblasRowMajor) { | |||
|     info = -1; | |||
|     /* swap the two dimensions */ | |||
|     t = n; | |||
|     n = m; | |||
|     m = t; | |||
|     if (ldc < MAX(1, m)) info = 8; | |||
|     if (lda < MAX(1, m)) info = 5; | |||
|     if (n < 0) info = 2; | |||
|     if (m < 0) info = 1; | |||
|   } | |||
|   if (info >= 0) { | |||
|     BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||
|     return; | |||
|   } | |||
| #endif | |||
|   /* Shared tail of both entry points: nothing to do for an empty matrix. */ | |||
|   if ((m==0) || (n==0)) return; | |||
|   IDEBUG_START; | |||
|   FUNCTION_PROFILE_START(); | |||
|   GEADD_K(m,n,ALPHA[0],ALPHA[1], a, lda, BETA[0], BETA[1], c, ldc); | |||
|   FUNCTION_PROFILE_END(1, 2* m * n , 2 * m * n); | |||
|   IDEBUG_END; | |||
|   return; | |||
| } | |||
| @@ -173,7 +173,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO | |||
| FUNCTION_PROFILE_START(); | |||
| if (incx < 0 ) x -= (n - 1) * incx; | |||
| if (incx < 0 ) x -= (n - 1) * incx * 2; | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| @@ -329,23 +329,27 @@ endif | |||
| ###### BLAS extensions ##### | |||
| SBLASOBJS += \ | |||
| somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ | |||
| somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) | |||
| somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ | |||
| sgeadd_k$(TSUFFIX).$(SUFFIX) | |||
| DBLASOBJS += \ | |||
| domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ | |||
| domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) | |||
| domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ | |||
| dgeadd_k$(TSUFFIX).$(SUFFIX) | |||
| CBLASOBJS += \ | |||
| comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ | |||
| comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ | |||
| comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ | |||
| comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) | |||
| comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ | |||
| cgeadd_k$(TSUFFIX).$(SUFFIX) | |||
| ZBLASOBJS += \ | |||
| zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ | |||
| zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ | |||
| zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ | |||
| zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) | |||
| zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ | |||
| zgeadd_k$(TSUFFIX).$(SUFFIX) | |||
| SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| @@ -3440,3 +3444,31 @@ $(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC) | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ | |||
| # Default GEADD kernel sources. Each precision falls back to a generic C | |||
| # file unless its <P>GEADD_K variable is already defined. Every object is | |||
| # built from the source named by its OWN variable, so that a per-precision | |||
| # override takes effect; the -D/-U flags select precision and complex mode. | |||
| ifndef SGEADD_K | |||
| SGEADD_K = ../generic/geadd.c | |||
| endif | |||
| $(KDIR)sgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K) | |||
| 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ | |||
| ifndef DGEADD_K | |||
| DGEADD_K = ../generic/geadd.c | |||
| endif | |||
| # The prerequisite must be DGEADD_K (not SGEADD_K), otherwise a DGEADD_K | |||
| # override is silently ignored. | |||
| $(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEADD_K) | |||
| 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ | |||
| ifndef CGEADD_K | |||
| CGEADD_K = ../generic/zgeadd.c | |||
| endif | |||
| $(KDIR)cgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEADD_K) | |||
| 	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM $< -o $@ | |||
| ifndef ZGEADD_K | |||
| ZGEADD_K = ../generic/zgeadd.c | |||
| endif | |||
| $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) | |||
| 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@ | |||
| @@ -0,0 +1 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV7 | |||
| @@ -0,0 +1 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV7 | |||
| @@ -0,0 +1,64 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| /* | |||
|  * Generic real GEADD kernel: walks the matrices one column at a time. | |||
|  *   rows, cols  extent of the region processed; nothing is done when | |||
|  *               either is <= 0 | |||
|  *   alpha, a    scalar and source matrix, columns lda elements apart | |||
|  *   beta, b     scalar and destination matrix, columns ldb elements apart | |||
|  * When alpha is exactly zero, a is never read and each column of b is | |||
|  * passed to SCAL_K with beta; otherwise each column pair goes to AXPBY_K. | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT beta, FLOAT *b, BLASLONG ldb) | |||
| { | |||
|     BLASLONG col; | |||
|     FLOAT *src = a; | |||
|     FLOAT *dst = b; | |||
|     if (rows <= 0 || cols <= 0) | |||
|         return 0; | |||
|     if (alpha == 0.0) { | |||
|         /* Source contributes nothing: only rescale the destination columns. */ | |||
|         for (col = 0; col < cols; col++, dst += ldb) { | |||
|             SCAL_K(rows, 0, 0, beta, dst, 1, NULL, 0, NULL, 0); | |||
|         } | |||
|         return 0; | |||
|     } | |||
|     for (col = 0; col < cols; col++, src += lda, dst += ldb) { | |||
|         AXPBY_K(rows, alpha, src, 1, beta, dst, 1); | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,65 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| /* | |||
|  * Generic complex GEADD kernel: walks the matrices one column at a time. | |||
|  *   rows, cols      extent of the region processed; nothing is done when | |||
|  *                   either is <= 0 | |||
|  *   alphar, alphai  real / imaginary part of the scalar used with a | |||
|  *   betar, betai    real / imaginary part of the scalar used with b | |||
|  *   lda, ldb        column distances; doubled below to obtain the pointer | |||
|  *                   step in FLOAT units | |||
|  * When both parts of alpha are exactly zero, a is never read and each | |||
|  * column of b is passed to SCAL_K with beta; otherwise each column pair | |||
|  * goes to AXPBY_K. Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alphar, FLOAT alphai, FLOAT *a, BLASLONG lda, FLOAT betar, FLOAT betai , FLOAT *b, BLASLONG ldb) | |||
| { | |||
|     BLASLONG col; | |||
|     FLOAT *src = a; | |||
|     FLOAT *dst = b; | |||
|     if (rows <= 0 || cols <= 0) | |||
|         return 0; | |||
|     /* Convert the column distances into FLOAT strides. */ | |||
|     lda *= 2; | |||
|     ldb *= 2; | |||
|     if (alphar == 0.0 && alphai == 0.0) { | |||
|         /* Source contributes nothing: only rescale the destination columns. */ | |||
|         for (col = 0; col < cols; col++, dst += ldb) { | |||
|             SCAL_K(rows, 0, 0, betar, betai, dst, 1, NULL, 0, NULL, 0); | |||
|         } | |||
|         return 0; | |||
|     } | |||
|     for (col = 0; col < cols; col++, src += lda, dst += ldb) { | |||
|         AXPBY_K(rows, alphar, alphai, src, 1, betar, betai, dst, 1); | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -548,8 +548,9 @@ gotoblas_t TABLE_NAME = { | |||
| comatcopy_k_cnTS, comatcopy_k_ctTS, comatcopy_k_rnTS, comatcopy_k_rtTS, | |||
| comatcopy_k_cncTS, comatcopy_k_ctcTS, comatcopy_k_rncTS, comatcopy_k_rtcTS, | |||
| zomatcopy_k_cnTS, zomatcopy_k_ctTS, zomatcopy_k_rnTS, zomatcopy_k_rtTS, | |||
| zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS | |||
| zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS, | |||
| sgeadd_kTS, dgeadd_kTS, cgeadd_kTS, zgeadd_kTS | |||
| }; | |||
| @@ -941,6 +942,23 @@ static void init_parameter(void) { | |||
| #endif | |||
| #endif | |||
| #ifdef STEAMROLLER | |||
| #ifdef DEBUG | |||
| fprintf(stderr, "Steamroller\n"); | |||
| #endif | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #ifdef EXPRECISION | |||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||
| #endif | |||
| #endif | |||
| #ifdef NANO | |||
| #ifdef DEBUG | |||
| @@ -34,17 +34,17 @@ CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S | |||
| ZGEMMINCOPY = | |||
| ZGEMMITCOPY = | |||
| ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S | |||
| ZGEMMINCOPY = zgemm_ncopy_1.S | |||
| ZGEMMITCOPY = zgemm_tcopy_1.S | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| ZGEMMINCOPYOBJ = | |||
| ZGEMMITCOPYOBJ = | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| #STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S | |||
| #STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S | |||
| #STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S | |||
| @@ -0,0 +1,88 @@ | |||
| # Kernel selection table: each variable names the source file (C or .S) | |||
| # chosen for one kernel; paths starting with ../generic/ pick generic sources. | |||
| # NOTE(review): presumably the kernel list for one x86-64 CPU target - confirm | |||
| # which target file this is. | |||
| # axpy / dot kernels | |||
| DAXPYKERNEL = daxpy.c | |||
| CAXPYKERNEL = caxpy.c | |||
| ZAXPYKERNEL = zaxpy.c | |||
| SDOTKERNEL = sdot.c | |||
| DDOTKERNEL = ddot.c | |||
| # symv kernels (U and L variants) | |||
| DSYMV_U_KERNEL = dsymv_U.c | |||
| DSYMV_L_KERNEL = dsymv_L.c | |||
| SSYMV_U_KERNEL = ssymv_U.c | |||
| SSYMV_L_KERNEL = ssymv_L.c | |||
| # gemv kernels (N and T variants) | |||
| SGEMVNKERNEL = sgemv_n_4.c | |||
| SGEMVTKERNEL = sgemv_t_4.c | |||
| DGEMVNKERNEL = dgemv_n_4.c | |||
| DGEMVTKERNEL = dgemv_t_4.c | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVTKERNEL = zgemv_t_4.c | |||
| DCOPYKERNEL = dcopy_bulldozer.S | |||
| # SGEMM: kernel, copy-routine sources and the object names built from them | |||
| SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| SGEMMONCOPY = gemm_ncopy_2_bulldozer.S | |||
| SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # DGEMM: kernel, copy-routine sources and the object names built from them | |||
| DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
| DGEMMONCOPY = gemm_ncopy_2_bulldozer.S | |||
| DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # CGEMM: kernel, copy-routine sources and the object names built from them | |||
| CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # ZGEMM: the INCOPY/ITCOPY sources and objects are deliberately left empty; | |||
| # only the ONCOPY/OTCOPY routines are named. | |||
| ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S | |||
| ZGEMMINCOPY = | |||
| ZGEMMITCOPY = | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMINCOPYOBJ = | |||
| ZGEMMITCOPYOBJ = | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # GEMM3M kernels | |||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
| # TRSM kernels: generic C sources except for the two DTRSM LT/RN entries | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S | |||
| DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #include "caxpy_microk_bulldozer-2.c" | |||
| #endif | |||
| @@ -40,7 +40,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 768(%2,%0,4) \n\t" | |||
| "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x | |||
| @@ -113,7 +113,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "addq $16, %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| @@ -49,10 +49,10 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| "vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3 | |||
| "cmpq $0 , %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| "je 2f \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 320(%4,%0,4) \n\t" | |||
| "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 | |||
| "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 | |||
| @@ -115,12 +115,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| "addq $16, %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| ".L01END%=: \n\t" | |||
| "2: \n\t" | |||
| "cmpq $4, %8 \n\t" | |||
| "jne .L02END%= \n\t" | |||
| "jne 3f \n\t" | |||
| "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 | |||
| "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 | |||
| @@ -155,7 +155,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y | |||
| ".L02END%=: \n\t" | |||
| "3: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| @@ -200,10 +200,10 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1 | |||
| "cmpq $0 , %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| "je 2f \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 320(%4,%0,4) \n\t" | |||
| "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 | |||
| "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 | |||
| @@ -248,12 +248,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| "addq $16, %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| ".L01END%=: \n\t" | |||
| "2: \n\t" | |||
| "cmpq $4, %6 \n\t" | |||
| "jne .L02END%= \n\t" | |||
| "jne 3f \n\t" | |||
| "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 | |||
| "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 | |||
| @@ -279,7 +279,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y | |||
| ".L02END%=: \n\t" | |||
| "3: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| @@ -320,10 +320,10 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0 | |||
| "cmpq $0 , %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| "je 2f \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 320(%4,%0,4) \n\t" | |||
| "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 | |||
| "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 | |||
| @@ -359,12 +359,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y | |||
| "vmovups %%ymm13,-32(%3,%0,4) \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| ".L01END%=: \n\t" | |||
| "2: \n\t" | |||
| "cmpq $4, %5 \n\t" | |||
| "jne .L02END%= \n\t" | |||
| "jne 3f \n\t" | |||
| "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 | |||
| @@ -386,7 +386,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y | |||
| ".L02END%=: \n\t" | |||
| "3: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| @@ -452,10 +452,10 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a | |||
| "vbroadcastss (%5), %%ymm1 \n\t" // alpha_i | |||
| "cmpq $0 , %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| "je 2f \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src | |||
| "vmovups 32(%2,%0,4), %%ymm9 \n\t" | |||
| @@ -489,12 +489,12 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a | |||
| "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y | |||
| "vmovups %%ymm13,-32(%3,%0,4) \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| ".L01END%=: \n\t" | |||
| "2: \n\t" | |||
| "cmpq $4, %6 \n\t" | |||
| "jne .L02END%= \n\t" | |||
| "jne 3f \n\t" | |||
| "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values src | |||
| @@ -516,7 +516,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a | |||
| "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y | |||
| ".L02END%=: \n\t" | |||
| "3: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| @@ -47,7 +47,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "jz 2f \n\t" | |||
| "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 | |||
| "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 | |||
| @@ -72,12 +72,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "addq $8 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "2: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L08END%= \n\t" | |||
| "je 3f \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 192(%4,%0,4) \n\t" | |||
| "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 | |||
| "prefetcht0 192(%5,%0,4) \n\t" | |||
| @@ -125,9 +125,9 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "addq $16 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| ".L08END%=: \n\t" | |||
| "3: \n\t" | |||
| "vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha | |||
| "vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha | |||
| @@ -269,7 +269,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "jz 2f \n\t" | |||
| "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 | |||
| "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1 | |||
| @@ -288,12 +288,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "addq $8 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "2: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L08END%= \n\t" | |||
| "je 3f \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 192(%4,%0,4) \n\t" | |||
| "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 | |||
| "prefetcht0 192(%5,%0,4) \n\t" | |||
| @@ -325,9 +325,9 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "addq $16 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| ".L08END%=: \n\t" | |||
| "3: \n\t" | |||
| "vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha | |||
| "vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha | |||
| @@ -426,7 +426,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "jz 2f \n\t" | |||
| "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 | |||
| @@ -442,12 +442,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * | |||
| "addq $8 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "2: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L08END%= \n\t" | |||
| "je 3f \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 192(%4,%0,4) \n\t" | |||
| "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0 | |||
| @@ -472,9 +472,9 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * | |||
| "addq $16 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| ".L08END%=: \n\t" | |||
| "3: \n\t" | |||
| "vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha | |||
| "vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha | |||
| @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NEHALEM) | |||
| #include "daxpy_microk_nehalem-2.c" | |||
| #elif defined(BULLDOZER) | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #include "daxpy_microk_bulldozer-2.c" | |||
| #endif | |||
| @@ -39,7 +39,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "vmovddup (%4), %%xmm0 \n\t" // alpha | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 768(%3,%0,8) \n\t" | |||
| "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x | |||
| @@ -61,7 +61,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| @@ -40,7 +40,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "shufpd $0, %%xmm0, %%xmm0 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| // "prefetcht0 192(%2,%0,8) \n\t" | |||
| // "prefetcht0 192(%3,%0,8) \n\t" | |||
| @@ -70,7 +70,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #include "ddot_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "ddot_microk_nehalem-2.c" | |||
| @@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x | |||
| "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x | |||
| "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x | |||
| @@ -55,7 +55,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" | |||
| "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" | |||
| @@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "xorpd %%xmm7, %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x | |||
| "movups (%3,%0,8), %%xmm8 \n\t" // 2 * y | |||
| @@ -65,7 +65,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| "addpd %%xmm5, %%xmm4 \n\t" | |||
| "addpd %%xmm7, %%xmm6 \n\t" | |||
| @@ -1092,18 +1092,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro INIT4x1 | |||
| vxorpd %xmm4 , %xmm4 , %xmm4 | |||
| vxorpd %xmm5 , %xmm5 , %xmm5 | |||
| vxorpd %ymm4 , %ymm4 , %ymm4 | |||
| vxorpd %ymm5 , %ymm5 , %ymm5 | |||
| vxorpd %ymm6 , %ymm6 , %ymm6 | |||
| vxorpd %ymm7 , %ymm7 , %ymm7 | |||
| .endm | |||
| /* | |||
|  * KERNEL4x1: eight multiply-accumulate steps in a single expansion. | |||
|  * Eight scalars are read from BO (offsets -12 .. -5, in units of SIZE) and | |||
|  * each is broadcast across a ymm register; each broadcast is then fused | |||
|  * multiply-added with a ymm-wide memory operand from AO (offsets -16 .. 12, | |||
|  * step 4, in units of SIZE). The products are spread over four separate | |||
|  * accumulators, ymm4..ymm7, so they must be summed by the caller afterwards. | |||
|  * On exit BO has advanced by 8*SIZE and AO by 32*SIZE. | |||
|  * Overwrites ymm0-ymm3 (broadcast temporaries) and updates ymm4-ymm7. | |||
|  * NOTE(review): assumes SIZE is the byte size of one double and that AO/BO | |||
|  * are biased so that the negative offsets address the current data - confirm. | |||
|  */ | |||
| .macro KERNEL4x1 | |||
| /* steps 0-3: broadcast four B values */ | |||
| vbroadcastsd -12 * SIZE(BO), %ymm0 | |||
| vbroadcastsd -11 * SIZE(BO), %ymm1 | |||
| vbroadcastsd -10 * SIZE(BO), %ymm2 | |||
| vbroadcastsd -9 * SIZE(BO), %ymm3 | |||
| vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 | |||
| vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 | |||
| /* ymm0/ymm1 are free again: reload them for steps 4-5 while steps 2-3 finish */ | |||
| vbroadcastsd -8 * SIZE(BO), %ymm0 | |||
| vbroadcastsd -7 * SIZE(BO), %ymm1 | |||
| vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 | |||
| vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 | |||
| /* reload ymm2/ymm3 for steps 6-7 */ | |||
| vbroadcastsd -6 * SIZE(BO), %ymm2 | |||
| vbroadcastsd -5 * SIZE(BO), %ymm3 | |||
| vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 | |||
| vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 | |||
| vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 | |||
| vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 | |||
| /* advance both pointers past the data consumed above */ | |||
| addq $ 8 *SIZE, BO | |||
| addq $ 32*SIZE, AO | |||
| .endm | |||
| .macro KERNEL4x1_SUB | |||
| vmovddup -12 * SIZE(BO), %xmm2 | |||
| vmovups -16 * SIZE(AO), %xmm0 | |||
| vmovups -14 * SIZE(AO), %xmm1 | |||
| vfmadd231pd %xmm0 ,%xmm2 , %xmm4 | |||
| vfmadd231pd %xmm1 ,%xmm2 , %xmm5 | |||
| vbroadcastsd -12 * SIZE(BO), %ymm2 | |||
| vmovups -16 * SIZE(AO), %ymm0 | |||
| vfmadd231pd %ymm0 ,%ymm2 , %ymm4 | |||
| addq $ 1*SIZE, BO | |||
| addq $ 4*SIZE, AO | |||
| @@ -1112,21 +1142,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro SAVE4x1 | |||
| vmovddup ALPHA, %xmm0 | |||
| vbroadcastsd ALPHA, %ymm0 | |||
| vmulpd %xmm0 , %xmm4 , %xmm4 | |||
| vmulpd %xmm0 , %xmm5 , %xmm5 | |||
| vaddpd %ymm4,%ymm5, %ymm4 | |||
| vaddpd %ymm6,%ymm7, %ymm6 | |||
| vaddpd %ymm4,%ymm6, %ymm4 | |||
| vmulpd %ymm0 , %ymm4 , %ymm4 | |||
| #if !defined(TRMMKERNEL) | |||
| vaddpd (CO1) , %xmm4, %xmm4 | |||
| vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 | |||
| vaddpd (CO1) , %ymm4, %ymm4 | |||
| #endif | |||
| vmovups %xmm4 , (CO1) | |||
| vmovups %xmm5 , 2 * SIZE(CO1) | |||
| vmovups %ymm4 , (CO1) | |||
| addq $ 4*SIZE, CO1 | |||
| .endm | |||
| @@ -2112,15 +2143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L1_12: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1 | |||
| dec %rax | |||
| jne .L1_12 | |||
| @@ -3180,15 +3203,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L1_12: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1 | |||
| dec %rax | |||
| jne .L1_12 | |||
| @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NEHALEM) | |||
| #include "dgemv_n_microk_nehalem-4.c" | |||
| #elif defined(HASWELL) | |||
| #elif defined(HASWELL) || defined(STEAMROLLER) | |||
| #include "dgemv_n_microk_haswell-4.c" | |||
| #endif | |||
| @@ -125,7 +125,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "shufpd $0, %%xmm13, %%xmm13 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y | |||
| "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y | |||
| @@ -148,7 +148,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| @@ -187,7 +187,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a | |||
| "shufpd $0, %%xmm12, %%xmm12 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a | |||
| "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a | |||
| "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y | |||
| @@ -203,7 +203,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| @@ -50,7 +50,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "vbroadcastsd (%9), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L8LABEL%= \n\t" | |||
| "jz 2f \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| @@ -77,14 +77,14 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L8LABEL%=: \n\t" | |||
| "2: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| "je 3f \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| @@ -118,9 +118,9 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "subq $8 , %1 \n\t" | |||
| "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| ".L16END%=: \n\t" | |||
| "3: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| @@ -168,7 +168,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "vbroadcastsd (%8), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L8LABEL%= \n\t" | |||
| "jz 2f \n\t" | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| @@ -188,14 +188,14 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L8LABEL%=: \n\t" | |||
| "2: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L8END%= \n\t" | |||
| "je 3f \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y | |||
| @@ -218,9 +218,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| ".L8END%=: \n\t" | |||
| "3: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| @@ -60,7 +60,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "xorpd %%xmm4 , %%xmm4 \n\t" | |||
| "xorpd %%xmm5 , %%xmm5 \n\t" | |||
| "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y | |||
| @@ -142,7 +142,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| @@ -194,7 +194,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "shufpd $0, %%xmm6 , %%xmm6 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "xorpd %%xmm4 , %%xmm4 \n\t" | |||
| "xorpd %%xmm5 , %%xmm5 \n\t" | |||
| "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y | |||
| @@ -239,7 +239,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| @@ -0,0 +1,247 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x8 1 | |||
| static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastsd (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 | |||
| "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 | |||
| "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 | |||
| "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 | |||
| "vbroadcastsd (%9), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz 2f \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" | |||
| "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" | |||
| "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" | |||
| "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y | |||
| "addq $4 , %8 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "2: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je 3f \n\t" | |||
| ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y | |||
| "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" | |||
| "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" | |||
| "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" | |||
| "addq $8 , %8 \n\t" | |||
| "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y | |||
| "subq $8 , %1 \n\t" | |||
| "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y | |||
| "jnz 1b \n\t" | |||
| "3: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastsd (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastsd (%8), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz 2f \n\t" | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" | |||
| "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "2: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je 3f \n\t" | |||
| ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y | |||
| "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" | |||
| "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" | |||
| "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y | |||
| "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| "3: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(HASWELL) | |||
| #if defined(HASWELL) || defined(STEAMROLLER) | |||
| #include "dgemv_t_microk_haswell-4.c" | |||
| #endif | |||
| @@ -78,7 +78,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT | |||
| "xorpd %%xmm11 , %%xmm11 \n\t" | |||
| "testq $2 , %1 \n\t" | |||
| "jz .L01LABEL%= \n\t" | |||
| "jz 2f \n\t" | |||
| "movups (%5,%0,8) , %%xmm14 \n\t" // x | |||
| "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 | |||
| @@ -90,13 +90,13 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT | |||
| "subq $2 , %1 \n\t" | |||
| "addpd %%xmm13 , %%xmm11 \n\t" | |||
| ".L01LABEL%=: \n\t" | |||
| "2: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| "je 3f \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "movups (%5,%0,8) , %%xmm14 \n\t" // x | |||
| "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 | |||
| @@ -116,9 +116,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| ".L01END%=: \n\t" | |||
| "3: \n\t" | |||
| "haddpd %%xmm10, %%xmm10 \n\t" | |||
| "haddpd %%xmm11, %%xmm11 \n\t" | |||
| @@ -157,7 +157,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| "xorpd %%xmm10 , %%xmm10 \n\t" | |||
| "testq $2 , %1 \n\t" | |||
| "jz .L01LABEL%= \n\t" | |||
| "jz 2f \n\t" | |||
| "movups (%3,%0,8) , %%xmm12 \n\t" | |||
| "movups (%4,%0,8) , %%xmm11 \n\t" | |||
| @@ -166,13 +166,13 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| "addpd %%xmm12 , %%xmm10 \n\t" | |||
| "subq $2 , %1 \n\t" | |||
| ".L01LABEL%=: \n\t" | |||
| "2: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| "je 3f \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "movups (%3,%0,8) , %%xmm12 \n\t" | |||
| "movups 16(%3,%0,8) , %%xmm14 \n\t" | |||
| @@ -185,9 +185,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| "subq $4 , %1 \n\t" | |||
| "addpd %%xmm14 , %%xmm9 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| ".L01END%=: \n\t" | |||
| "3: \n\t" | |||
| "addpd %%xmm9 , %%xmm10 \n\t" | |||
| "haddpd %%xmm10, %%xmm10 \n\t" | |||
| @@ -246,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d | |||
| "shufpd $0 , %%xmm10 , %%xmm10 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "movups (%3,%0,8) , %%xmm12 \n\t" | |||
| "movups (%4,%0,8) , %%xmm11 \n\t" | |||
| @@ -256,7 +256,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d | |||
| "subq $2 , %1 \n\t" | |||
| "movups %%xmm11, -16(%4,%0,8) \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| @@ -42,7 +42,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| "vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t" | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "jz 2f \n\t" | |||
| "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x | |||
| @@ -54,13 +54,13 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "2: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| "je 3f \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| // "prefetcht0 384(%2,%0,8) \n\t" | |||
| "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x | |||
| "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x | |||
| @@ -80,9 +80,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| "subq $8 , %1 \n\t" | |||
| "vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| ".L16END%=: \n\t" | |||
| "3: \n\t" | |||
| "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" | |||
| "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" | |||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #include "dsymv_L_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "dsymv_L_microk_nehalem-2.c" | |||
| @@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL | |||
| "vmovddup 24(%8), %%xmm7 \n\t" // temp1[1] | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a | |||
| "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x | |||
| @@ -90,7 +90,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL | |||
| "vmovups %%xmm11 , -16(%3,%0,8) \n\t" | |||
| "cmpq %0 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| "vmovsd (%9), %%xmm4 \n\t" | |||
| "vmovsd 8(%9), %%xmm5 \n\t" | |||
| @@ -48,7 +48,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL | |||
| "shufpd $0, %%xmm7, %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a | |||
| "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x | |||
| "movups %%xmm12 , %%xmm11 \n\t" | |||
| @@ -85,7 +85,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL | |||
| "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y | |||
| "cmpq %0 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| "movsd (%9), %%xmm4 \n\t" // temp1[0] | |||
| "movsd 8(%9), %%xmm5 \n\t" // temp1[1] | |||
| @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #include "dsymv_U_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "dsymv_U_microk_nehalem-2.c" | |||
| @@ -47,7 +47,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT | |||
| "xorq %0,%0 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a | |||
| "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x | |||
| @@ -93,7 +93,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT | |||
| "vmovups %%xmm9 , -32(%3,%0,8) \n\t" | |||
| "vmovups %%xmm11 , -16(%3,%0,8) \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t" | |||
| "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t" | |||
| @@ -51,7 +51,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT | |||
| "xorq %0,%0 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a | |||
| "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x | |||
| "movups %%xmm12 , %%xmm11 \n\t" | |||
| @@ -88,7 +88,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT | |||
| "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y | |||
| "subq $2 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| "haddpd %%xmm0, %%xmm0 \n\t" | |||
| "haddpd %%xmm1, %%xmm1 \n\t" | |||
| @@ -40,7 +40,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "shufps $0, %%xmm0, %%xmm0 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| // "prefetcht0 192(%2,%0,4) \n\t" | |||
| // "prefetcht0 192(%3,%0,4) \n\t" | |||
| @@ -70,7 +70,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #include "sdot_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "sdot_microk_nehalem-2.c" | |||
| @@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x | |||
| "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x | |||
| @@ -55,7 +55,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" | |||
| "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" | |||
| @@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "xorps %%xmm7, %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "movups (%3,%0,4), %%xmm8 \n\t" // 4 * x | |||
| "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x | |||
| @@ -64,7 +64,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| "addps %%xmm5, %%xmm4 \n\t" | |||
| "addps %%xmm7, %%xmm6 \n\t" | |||
| @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #include "sgemv_n_microk_bulldozer-4.c" | |||
| #elif defined(NEHALEM) | |||
| #include "sgemv_n_microk_nehalem-4.c" | |||
| @@ -39,8 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "sgemv_n_microk_haswell-4.c" | |||
| #endif | |||
| #if defined(STEAMROLLER) | |||
| #define NBMAX 2048 | |||
| #else | |||
| #define NBMAX 4096 | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x8 | |||
| @@ -129,7 +132,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "shufps $0, %%xmm13, %%xmm13 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y | |||
| "movups (%4,%0,4), %%xmm8 \n\t" | |||
| @@ -143,7 +146,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| @@ -166,7 +169,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x2 | |||
| #ifndef HAVE_KERNEL_4x1 | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| @@ -184,10 +187,10 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a | |||
| "shufps $0, %%xmm12, %%xmm12 \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| "je 2f \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y | |||
| "movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y | |||
| "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a | |||
| @@ -203,12 +206,12 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| ".L16END%=: \n\t" | |||
| "2: \n\t" | |||
| "testq $0x04, %5 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "jz 3f \n\t" | |||
| "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y | |||
| "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a | |||
| @@ -218,7 +221,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "3: \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| @@ -262,7 +265,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| ( | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "movups (%2,%0,4) , %%xmm12 \n\t" | |||
| "movups (%3,%0,4) , %%xmm11 \n\t" | |||
| @@ -271,7 +274,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| "movups %%xmm11, -16(%3,%0,4) \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| @@ -49,7 +49,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "vbroadcastss (%9), %%xmm8 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "jz 2f \n\t" | |||
| "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" | |||
| @@ -71,10 +71,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "subq $4 , %1 \n\t" | |||
| "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y | |||
| ".L08LABEL%=: \n\t" | |||
| "2: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "jz 3f \n\t" | |||
| "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" | |||
| @@ -107,13 +107,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "subq $8 , %1 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "3: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| "je 4f \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" | |||
| @@ -178,9 +178,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| ".L16END%=: \n\t" | |||
| "4: \n\t" | |||
| : | |||
| : | |||
| @@ -227,7 +227,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "vbroadcastss (%8), %%xmm8 \n\t" // alpha | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "1: \n\t" | |||
| "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" | |||
| @@ -243,7 +243,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||