| @@ -1,4 +1,22 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.2.6 | |||
| 2-Mar-2013 | |||
| common: | |||
| * Improved OpenMP performance slightly. (d744c9) | |||
| * Improved cblas.h compatibility with Intel MKL. (#185) | |||
| * Fixed the overflow bug in single-threaded Cholesky factorization. | |||
| * Fixed the buffer overflow bug in multithreaded hbmv and sbmv. (#174) | |||
| x86/x86-64: | |||
| * Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar) | |||
| We will tune the performance in the future. | |||
| * Auto-detect Intel Xeon E7540. | |||
| * Fixed the buffer overflow bug in gemv. (#173) | |||
| * Fixed the s/cdot bug involving an invalid read of NaN on x86_64. (#189) | |||
| MIPS64: | |||
| ==================================================================== | |||
| Version 0.2.5 | |||
| 26-Nov-2012 | |||
| @@ -314,7 +314,7 @@ clean :: | |||
| #endif | |||
| @$(MAKE) -C reference clean | |||
| @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h | |||
| @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib | |||
| @rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib | |||
| @if test -d $(NETLIB_LAPACK_DIR); then \ | |||
| echo deleting $(NETLIB_LAPACK_DIR); \ | |||
| rm -rf $(NETLIB_LAPACK_DIR) ;\ | |||
| @@ -1,3 +1,5 @@ | |||
| # This is triggered by Makefile.system and runs before any of the code is built. | |||
| export BINARY | |||
| export USE_OPENMP | |||
| @@ -15,7 +17,7 @@ ifdef CPUIDEMU | |||
| EXFLAGS = -DCPUIDEMU -DVENDOR=99 | |||
| endif | |||
| all: getarch_2nd | |||
| all: getarch_2nd cblas_noconst.h | |||
| ./getarch_2nd 0 >> $(TARGET_MAKE) | |||
| ./getarch_2nd 1 >> $(TARGET_CONF) | |||
| @@ -36,4 +38,7 @@ else | |||
| $(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c | |||
| endif | |||
| cblas_noconst.h : cblas.h | |||
| perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h | |||
| dummy: | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.2.5 | |||
| VERSION = 0.2.6 | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -70,7 +70,7 @@ ifndef GOTOBLAS_MAKEFILE | |||
| export GOTOBLAS_MAKEFILE = 1 | |||
| # Generating Makefile.conf and config.h | |||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) | |||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) | |||
| ifndef TARGET_CORE | |||
| include $(TOPDIR)/Makefile.conf | |||
| @@ -277,14 +277,14 @@ ifeq ($(ARCH), x86) | |||
| DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | |||
| CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
| ifneq ($(NO_AVX), 1) | |||
| DYNAMIC_CORE += SANDYBRIDGE | |||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), x86_64) | |||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO | |||
| ifneq ($(NO_AVX), 1) | |||
| DYNAMIC_CORE += SANDYBRIDGE | |||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER | |||
| endif | |||
| endif | |||
| @@ -44,7 +44,7 @@ Please read GotoBLAS_01Readme.txt | |||
| - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. | |||
| - **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. | |||
| - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | |||
| - **AMD Bulldozer**: Used GotoBLAS2 Barcelona codes. | |||
| - **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar) | |||
| #### MIPS64: | |||
| - **ICT Loongson 3A**: Optimized Level-3 BLAS and part of Level-1 and Level-2. | |||
| @@ -29,6 +29,7 @@ BARCELONA | |||
| SHANGHAI | |||
| ISTANBUL | |||
| BOBCAT | |||
| BULLDOZER | |||
| c)VIA CPU: | |||
| SSE_GENERIC | |||
| @@ -1,291 +1,293 @@ | |||
| #ifndef CBLAS_H | |||
| #define CBLAS_H | |||
| #include <stddef.h> | |||
| #include "common.h" | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| /* Assume C declarations for C++ */ | |||
| #endif /* __cplusplus */ | |||
| #include <stddef.h> | |||
| #include "common.h" | |||
| /* Set the number of threads at runtime. */ | |||
| void openblas_set_num_threads(int num_threads); | |||
| void goto_set_num_threads(int num_threads); | |||
| /* Get the build configuration at runtime. */ | |||
| char* openblas_get_config(void); | |||
| #define CBLAS_INDEX size_t | |||
| enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; | |||
| enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114}; | |||
| enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; | |||
| enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; | |||
| enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; | |||
| float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy); | |||
| double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); | |||
| openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); | |||
| openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); | |||
| void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); | |||
| void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); | |||
| void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); | |||
| void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); | |||
| float cblas_sasum (blasint n, float *x, blasint incx); | |||
| double cblas_dasum (blasint n, double *x, blasint incx); | |||
| float cblas_scasum(blasint n, float *x, blasint incx); | |||
| double cblas_dzasum(blasint n, double *x, blasint incx); | |||
| float cblas_snrm2 (blasint N, float *X, blasint incX); | |||
| double cblas_dnrm2 (blasint N, double *X, blasint incX); | |||
| float cblas_scnrm2(blasint N, float *X, blasint incX); | |||
| double cblas_dznrm2(blasint N, double *X, blasint incX); | |||
| CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); | |||
| CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); | |||
| CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); | |||
| CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); | |||
| void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy); | |||
| void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy); | |||
| void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy); | |||
| void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy); | |||
| void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); | |||
| void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); | |||
| void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); | |||
| void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); | |||
| void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); | |||
| void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); | |||
| void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); | |||
| typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; | |||
| typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; | |||
| typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; | |||
| typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; | |||
| typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; | |||
| float cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy); | |||
| double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); | |||
| float cblas_sdot(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); | |||
| double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); | |||
| openblas_complex_float cblas_cdotu(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); | |||
| openblas_complex_float cblas_cdotc(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); | |||
| openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); | |||
| openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); | |||
| void cblas_cdotu_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret); | |||
| void cblas_cdotc_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret); | |||
| void cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); | |||
| void cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); | |||
| float cblas_sasum (const blasint n, const float *x, const blasint incx); | |||
| double cblas_dasum (const blasint n, const double *x, const blasint incx); | |||
| float cblas_scasum(const blasint n, const float *x, const blasint incx); | |||
| double cblas_dzasum(const blasint n, const double *x, const blasint incx); | |||
| float cblas_snrm2 (const blasint N, const float *X, const blasint incX); | |||
| double cblas_dnrm2 (const blasint N, const double *X, const blasint incX); | |||
| float cblas_scnrm2(const blasint N, const float *X, const blasint incX); | |||
| double cblas_dznrm2(const blasint N, const double *X, const blasint incX); | |||
| CBLAS_INDEX cblas_isamax(const blasint n, const float *x, const blasint incx); | |||
| CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx); | |||
| CBLAS_INDEX cblas_icamax(const blasint n, const float *x, const blasint incx); | |||
| CBLAS_INDEX cblas_izamax(const blasint n, const double *x, const blasint incx); | |||
| void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy); | |||
| void cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy); | |||
| void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy); | |||
| void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy); | |||
| void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); | |||
| void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); | |||
| void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); | |||
| void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); | |||
| void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); | |||
| void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy); | |||
| void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); | |||
| void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy); | |||
| void cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s); | |||
| void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double s); | |||
| void cblas_srotg(float *a, float *b, float *c, float *s); | |||
| void cblas_drotg(double *a, double *b, double *c, double *s); | |||
| void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); | |||
| void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); | |||
| void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); | |||
| void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); | |||
| void cblas_sscal(blasint N, float alpha, float *X, blasint incX); | |||
| void cblas_dscal(blasint N, double alpha, double *X, blasint incX); | |||
| void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); | |||
| void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); | |||
| void cblas_csscal(blasint N, float alpha, float *X, blasint incX); | |||
| void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); | |||
| void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||
| float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); | |||
| void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||
| double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); | |||
| void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||
| float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); | |||
| void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, | |||
| double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); | |||
| void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||
| void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||
| void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||
| void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||
| void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||
| void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||
| void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); | |||
| void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); | |||
| void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); | |||
| void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); | |||
| void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, | |||
| blasint incX, float *Y, blasint incY, float *A, blasint lda); | |||
| void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, | |||
| blasint incX, double *Y, blasint incY, double *A, blasint lda); | |||
| void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, | |||
| float *Y, blasint incY, float *A, blasint lda); | |||
| void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, | |||
| double *Y, blasint incY, double *A, blasint lda); | |||
| void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||
| blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); | |||
| void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||
| blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); | |||
| void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||
| blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||
| void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, | |||
| blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||
| void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, | |||
| blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); | |||
| void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, | |||
| blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); | |||
| void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); | |||
| void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); | |||
| void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, float *Ap, float *X, blasint incX); | |||
| void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, double *Ap, double *X, blasint incX); | |||
| void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, float *Ap, float *X, blasint incX); | |||
| void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, double *Ap, double *X, blasint incX); | |||
| void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, float *Ap, float *X, blasint incX); | |||
| void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, double *Ap, double *X, blasint incX); | |||
| void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, float *Ap, float *X, blasint incX); | |||
| void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, | |||
| blasint N, double *Ap, double *X, blasint incX); | |||
| void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, | |||
| blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); | |||
| void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, | |||
| blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); | |||
| void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, | |||
| blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||
| void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, | |||
| blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||
| void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, | |||
| float *X, blasint incX, float beta, float *Y, blasint incY); | |||
| void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, | |||
| double *X, blasint incX, double beta, double *Y, blasint incY); | |||
| void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); | |||
| void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); | |||
| void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); | |||
| void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); | |||
| void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); | |||
| void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); | |||
| void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); | |||
| void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); | |||
| void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, | |||
| float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||
| void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, | |||
| double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||
| void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, | |||
| float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); | |||
| void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, | |||
| double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); | |||
| void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||
| float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||
| void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||
| double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||
| void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||
| void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, | |||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||
| void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||
| float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||
| void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||
| double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||
| void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||
| void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||
| void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); | |||
| void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); | |||
| void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); | |||
| void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); | |||
| void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||
| void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||
| void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||
| void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, | |||
| blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||
| void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); | |||
| void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); | |||
| void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); | |||
| void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); | |||
| void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); | |||
| void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); | |||
| void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); | |||
| void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, | |||
| enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); | |||
| void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); | |||
| void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, | |||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); | |||
| void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||
| float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); | |||
| void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||
| double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); | |||
| void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||
| float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); | |||
| void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, | |||
| double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); | |||
| void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P); | |||
| void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P); | |||
| void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); | |||
| void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); | |||
| void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX); | |||
| void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX); | |||
| void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX); | |||
| void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX); | |||
| void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX); | |||
| void cblas_zdscal(const blasint N, const double alpha, double *X, const blasint incX); | |||
| void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, | |||
| const float alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float beta, float *y, const blasint incy); | |||
| void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, | |||
| const double alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double beta, double *y, const blasint incy); | |||
| void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, | |||
| const float *alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float *beta, float *y, const blasint incy); | |||
| void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, | |||
| const double *alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double *beta, double *y, const blasint incy); | |||
| void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); | |||
| void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | |||
| void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); | |||
| void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); | |||
| void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | |||
| void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | |||
| void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); | |||
| void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda); | |||
| void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); | |||
| void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda); | |||
| void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X, | |||
| const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); | |||
| void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, | |||
| const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); | |||
| void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, | |||
| const float *Y, const blasint incY, float *A, const blasint lda); | |||
| void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, | |||
| const double *Y, const blasint incY, double *A, const blasint lda); | |||
| void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | |||
| const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | |||
| void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | |||
| const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | |||
| void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | |||
| const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | |||
| void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, | |||
| const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | |||
| void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A, | |||
| const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | |||
| void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A, | |||
| const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | |||
| void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); | |||
| void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); | |||
| void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const float *Ap, float *X, const blasint incX); | |||
| void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const double *Ap, double *X, const blasint incX); | |||
| void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const float *Ap, float *X, const blasint incX); | |||
| void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const double *Ap, double *X, const blasint incX); | |||
| void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const float *Ap, float *X, const blasint incX); | |||
| void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const double *Ap, double *X, const blasint incX); | |||
| void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const float *Ap, float *X, const blasint incX); | |||
| void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, | |||
| const blasint N, const double *Ap, double *X, const blasint incX); | |||
| void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A, | |||
| const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | |||
| void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *A, | |||
| const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | |||
| void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A, | |||
| const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | |||
| void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A, | |||
| const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | |||
| void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap, | |||
| const float *X, const blasint incX, const float beta, float *Y, const blasint incY); | |||
| void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *Ap, | |||
| const double *X, const blasint incX, const double beta, double *Y, const blasint incY); | |||
| void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap); | |||
| void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap); | |||
| void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A); | |||
| void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A); | |||
| void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A); | |||
| void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A); | |||
| void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap); | |||
| void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap); | |||
| void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, | |||
| const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | |||
| void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, | |||
| const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | |||
| void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, | |||
| const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); | |||
| void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, | |||
| const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); | |||
| void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | |||
| const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); | |||
| void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | |||
| const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); | |||
| void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | |||
| const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | |||
| void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, | |||
| const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | |||
| void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||
| const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); | |||
| void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||
| const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); | |||
| void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||
| const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | |||
| void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||
| const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | |||
| void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc); | |||
| void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); | |||
| void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc); | |||
| void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc); | |||
| void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); | |||
| void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); | |||
| void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | |||
| void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, | |||
| const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | |||
| void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); | |||
| void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); | |||
| void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); | |||
| void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb); | |||
| void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); | |||
| void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); | |||
| void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); | |||
| void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, | |||
| const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb); | |||
| void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||
| const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); | |||
| void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, | |||
| const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); | |||
| void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | |||
| const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc); | |||
| void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | |||
| const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); | |||
| void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | |||
| const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); | |||
| void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, | |||
| const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); | |||
| void cblas_xerbla(blasint p, char *rout, char *form, ...); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif /* __cplusplus */ | |||
| #endif | |||
| @@ -390,7 +390,8 @@ typedef int blasint; | |||
| /* C99 supports complex floating numbers natively, which GCC also offers as an | |||
| extension since version 3.0. If neither are available, use a compatible | |||
| structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ | |||
| #if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 | |||
| #if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ | |||
| (__GNUC__ >= 3 && !defined(__cplusplus))) | |||
| #define OPENBLAS_COMPLEX_C99 | |||
| typedef float _Complex openblas_complex_float; | |||
| typedef double _Complex openblas_complex_double; | |||
| @@ -557,7 +558,8 @@ typedef struct { | |||
| #include "common_level3.h" | |||
| #include "common_lapack.h" | |||
| #ifdef CBLAS | |||
| #include "cblas.h" | |||
| /* This header file is generated from "cblas.h" (see Makefile.prebuild). */ | |||
| #include "cblas_noconst.h" | |||
| #endif | |||
| #ifndef ASSEMBLER | |||
| @@ -125,7 +125,8 @@ | |||
| #define HAVE_MISALIGNSSE (1 << 15) | |||
| #define HAVE_128BITFPU (1 << 16) | |||
| #define HAVE_FASTMOVU (1 << 17) | |||
| #define HAVE_AVX (1 << 18) | |||
| #define HAVE_AVX (1 << 18) | |||
| #define HAVE_FMA4 (1 << 19) | |||
| #define CACHE_INFO_L1_I 1 | |||
| #define CACHE_INFO_L1_D 2 | |||
| @@ -43,6 +43,8 @@ | |||
| #ifdef NO_AVX | |||
| #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM | |||
| #define CORE_SANDYBRIDGE CORE_NEHALEM | |||
| #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA | |||
| #define CORE_BULLDOZER CORE_BARCELONA | |||
| #endif | |||
| #ifndef CPUIDEMU | |||
| @@ -116,8 +118,9 @@ static inline int have_excpuid(void){ | |||
| #ifndef NO_AVX | |||
| static inline void xgetbv(int op, int * eax, int * edx){ | |||
| //Emit the raw opcode bytes of xgetbv (0f 01 d0) instead of the mnemonic; op is passed in ecx and the result comes back in eax/edx. NOTE(review): presumably for assemblers that lack the mnemonic - confirm. | |||
| __asm__ __volatile__ | |||
| ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); | |||
| (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); | |||
| } | |||
| #endif | |||
| @@ -228,6 +231,9 @@ int get_cputype(int gettype){ | |||
| cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | |||
| if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; | |||
| if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; | |||
| #ifndef NO_AVX | |||
| if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4; | |||
| #endif | |||
| if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; | |||
| if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; | |||
| } | |||
| @@ -1030,6 +1036,8 @@ int get_cpuname(void){ | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 14: | |||
| // Xeon E7540 | |||
| case 15: | |||
| //Xeon Processor E7 (Westmere-EX) | |||
| return CPUTYPE_NEHALEM; | |||
| @@ -1075,8 +1083,12 @@ int get_cpuname(void){ | |||
| return CPUTYPE_OPTERON; | |||
| case 1: | |||
| case 10: | |||
| case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
| return CPUTYPE_BARCELONA; | |||
| case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
| if(support_avx()) | |||
| return CPUTYPE_BULLDOZER; | |||
| else | |||
| return CPUTYPE_BARCELONA; //OS doesn't support AVX; report the Barcelona type instead. | |||
| case 5: | |||
| return CPUTYPE_BOBCAT; | |||
| } | |||
| @@ -1398,6 +1410,8 @@ int get_coretype(void){ | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; //OS doesn't support AVX | |||
| case 14: | |||
| //Xeon E7540 | |||
| case 15: | |||
| //Xeon Processor E7 (Westmere-EX) | |||
| return CORE_NEHALEM; | |||
| @@ -1427,8 +1441,13 @@ int get_coretype(void){ | |||
| if (family == 0xf){ | |||
| if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | |||
| else if (exfamily == 5) return CORE_BOBCAT; | |||
| else if (exfamily == 6) return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
| else return CORE_BARCELONA; | |||
| else if (exfamily == 6) { | |||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
| if(support_avx()) | |||
| return CORE_BULLDOZER; | |||
| else | |||
| return CORE_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| }else return CORE_BARCELONA; | |||
| } | |||
| } | |||
| @@ -1494,6 +1513,9 @@ void get_cpuconfig(void){ | |||
| printf("#define DTB_SIZE %d\n", info.size * 1024); | |||
| printf("#define DTB_ASSOCIATIVE %d\n", info.associative); | |||
| printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize); | |||
| } else { | |||
| //fall back for some virtual machines. | |||
| printf("#define DTB_DEFAULT_ENTRIES 32\n"); | |||
| } | |||
| features = get_cputype(GET_FEATURE); | |||
| @@ -1511,6 +1533,7 @@ void get_cpuconfig(void){ | |||
| if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); | |||
| if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | |||
| if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | |||
| if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | |||
| if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); | |||
| if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); | |||
| if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); | |||
| @@ -1577,5 +1600,6 @@ void get_sse(void){ | |||
| if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); | |||
| if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | |||
| if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | |||
| if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | |||
| } | |||
| @@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||
| a = (FLOAT *)args -> a; | |||
| x = (FLOAT *)args -> b; | |||
| y = (FLOAT *)args -> c; | |||
| lda = args -> lda; | |||
| incx = args -> ldb; | |||
| @@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||
| n_from = 0; | |||
| n_to = n; | |||
| //Use y as each thread's n* COMPSIZE elements in sb buffer | |||
| y = buffer; | |||
| buffer += ((COMPSIZE * n + 1023) & ~1023); | |||
| if (range_m) { | |||
| n_from = *(range_m + 0); | |||
| n_to = *(range_m + 1); | |||
| @@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||
| a += n_from * lda * COMPSIZE; | |||
| } | |||
| if (range_n) y += *range_n * COMPSIZE; | |||
| if (incx != 1) { | |||
| COPY_K(n, x, incx, buffer, 1); | |||
| @@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
| if (num_cpu) { | |||
| queue[0].sa = NULL; | |||
| queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; | |||
| queue[0].sb = buffer; | |||
| queue[num_cpu - 1].next = NULL; | |||
| exec_blas(num_cpu, queue); | |||
| @@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
| #else | |||
| ONE, ZERO, | |||
| #endif | |||
| buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); | |||
| (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0); | |||
| } | |||
| AXPYU_K(n, 0, 0, | |||
| @@ -1,7 +1,7 @@ | |||
| TOPDIR = ../.. | |||
| include ../../Makefile.system | |||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) | |||
| COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) | |||
| COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) | |||
| @@ -103,6 +103,9 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../. | |||
| openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| openblas_get_config.$(SUFFIX) : openblas_get_config.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| @@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){ | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| } | |||
| } | |||
| queue->sb=sb; | |||
| } | |||
| #ifdef MONITOR | |||
| @@ -49,8 +49,12 @@ | |||
| int blas_server_avail = 0; | |||
| static void * blas_thread_buffer[MAX_CPU_NUMBER]; | |||
| void goto_set_num_threads(int num_threads) { | |||
| int i=0; | |||
| if (num_threads < 1) num_threads = blas_num_threads; | |||
| if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | |||
| @@ -62,7 +66,19 @@ void goto_set_num_threads(int num_threads) { | |||
| blas_cpu_number = num_threads; | |||
| omp_set_num_threads(blas_cpu_number); | |||
| //adjust buffer for each thread | |||
| for(i=0; i<blas_cpu_number; i++){ | |||
| if(blas_thread_buffer[i]==NULL){ | |||
| blas_thread_buffer[i]=blas_memory_alloc(2); | |||
| } | |||
| } | |||
| for(; i<MAX_CPU_NUMBER; i++){ | |||
| if(blas_thread_buffer[i]!=NULL){ | |||
| blas_memory_free(blas_thread_buffer[i]); | |||
| blas_thread_buffer[i]=NULL; | |||
| } | |||
| } | |||
| #if defined(ARCH_MIPS64) | |||
| //set parameters for different number of threads. | |||
| blas_set_parameter(); | |||
| @@ -76,17 +92,33 @@ void openblas_set_num_threads(int num_threads) { | |||
| /* Start-up hook for the threading layer. | |||
| * Calls blas_get_cpu_number(), marks the server as available and fills | |||
| * blas_thread_buffer: slots below blas_num_threads receive a buffer from | |||
| * blas_memory_alloc(2), every remaining slot is set to NULL. | |||
| * Always returns 0. */ | |||
| int blas_thread_init(void){ | |||
| int i=0; | |||
| blas_get_cpu_number(); | |||
| blas_server_avail = 1; | |||
| /* One pass over the whole table: the index is bounded by MAX_CPU_NUMBER, | |||
| * so it cannot run past blas_thread_buffer even if blas_num_threads is | |||
| * larger than the table. */ | |||
| for(i=0; i<MAX_CPU_NUMBER; i++){ | |||
| blas_thread_buffer[i] = (i<blas_num_threads) ? blas_memory_alloc(2) : NULL; | |||
| } | |||
| return 0; | |||
| } | |||
| /* Shutdown hook for the threading layer. | |||
| * Clears blas_server_avail, then hands every buffer still recorded in | |||
| * blas_thread_buffer back through blas_memory_free() and resets the slot | |||
| * to NULL. Always returns 0. */ | |||
| int BLASFUNC(blas_thread_shutdown)(void){ | |||
| int slot; | |||
| blas_server_avail = 0; | |||
| for(slot=0; slot<MAX_CPU_NUMBER; slot++){ | |||
| /* Empty slot: nothing to release. */ | |||
| if(blas_thread_buffer[slot]==NULL) continue; | |||
| blas_memory_free(blas_thread_buffer[slot]); | |||
| blas_thread_buffer[slot]=NULL; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -177,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| static void exec_threads(blas_queue_t *queue){ | |||
| void *buffer, *sa, *sb; | |||
| int pos=0, release_flag=0; | |||
| buffer = NULL; | |||
| sa = queue -> sa; | |||
| sb = queue -> sb; | |||
| @@ -189,7 +222,14 @@ static void exec_threads(blas_queue_t *queue){ | |||
| if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { | |||
| buffer = blas_memory_alloc(2); | |||
| pos = omp_get_thread_num(); | |||
| buffer = blas_thread_buffer[pos]; | |||
| //fallback | |||
| if(buffer==NULL) { | |||
| buffer = blas_memory_alloc(2); | |||
| release_flag=1; | |||
| } | |||
| if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||
| @@ -224,6 +264,7 @@ static void exec_threads(blas_queue_t *queue){ | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| } | |||
| } | |||
| queue->sb=sb; | |||
| } | |||
| } | |||
| @@ -241,7 +282,7 @@ static void exec_threads(blas_queue_t *queue){ | |||
| } | |||
| if (buffer != NULL) blas_memory_free(buffer); | |||
| if (release_flag) blas_memory_free(buffer); | |||
| } | |||
| @@ -253,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| } | |||
| } | |||
| queue->sb=sb; | |||
| } | |||
| #ifdef MONITOR | |||
| @@ -495,4 +496,4 @@ void goto_set_num_threads(int num_threads) | |||
| void openblas_set_num_threads(int num) | |||
| { | |||
| goto_set_num_threads(num); | |||
| } | |||
| } | |||
| @@ -63,9 +63,11 @@ extern gotoblas_t gotoblas_BARCELONA; | |||
| extern gotoblas_t gotoblas_BOBCAT; | |||
| #ifndef NO_AVX | |||
| extern gotoblas_t gotoblas_SANDYBRIDGE; | |||
| extern gotoblas_t gotoblas_BULLDOZER; | |||
| #else | |||
| //Use NEHALEM kernels for sandy bridge | |||
| #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM | |||
| #define gotoblas_BULLDOZER gotoblas_BARCELONA | |||
| #endif | |||
| @@ -78,8 +80,9 @@ extern gotoblas_t gotoblas_SANDYBRIDGE; | |||
| #ifndef NO_AVX | |||
| static inline void xgetbv(int op, int * eax, int * edx){ | |||
| //Use binary code for xgetbv | |||
| __asm__ __volatile__ | |||
| ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); | |||
| (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); | |||
| } | |||
| #endif | |||
| @@ -163,7 +166,8 @@ static gotoblas_t *get_coretype(void){ | |||
| //Intel Xeon Processor 5600 (Westmere-EP) | |||
| //Xeon Processor E7 (Westmere-EX) | |||
| if (model == 12 || model == 15) return &gotoblas_NEHALEM; | |||
| //Xeon E7540 | |||
| if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM; | |||
| //Intel Core i5-2000 /i7-2000 (Sandy Bridge) | |||
| //Intel Core i7-3000 / Xeon E5 | |||
| @@ -171,7 +175,7 @@ static gotoblas_t *get_coretype(void){ | |||
| if(support_avx()) | |||
| return &gotoblas_SANDYBRIDGE; | |||
| else{ | |||
| fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n"); | |||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| @@ -182,7 +186,7 @@ static gotoblas_t *get_coretype(void){ | |||
| if(support_avx()) | |||
| return &gotoblas_SANDYBRIDGE; | |||
| else{ | |||
| fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n"); | |||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| @@ -202,6 +206,14 @@ static gotoblas_t *get_coretype(void){ | |||
| else return &gotoblas_OPTERON; | |||
| } else if (exfamily == 5) { | |||
| return &gotoblas_BOBCAT; | |||
| } else if (exfamily == 6) { | |||
| //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series | |||
| if(support_avx()) | |||
| return &gotoblas_BULLDOZER; | |||
| else{ | |||
| fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"); | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } else { | |||
| return &gotoblas_BARCELONA; | |||
| } | |||
| @@ -238,6 +250,7 @@ static char *corename[] = { | |||
| "Nano", | |||
| "Sandybridge", | |||
| "Bobcat", | |||
| "Bulldozer", | |||
| }; | |||
| char *gotoblas_corename(void) { | |||
| @@ -259,6 +272,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_NANO) return corename[15]; | |||
| if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | |||
| if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | |||
| if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | |||
| return corename[0]; | |||
| } | |||
| @@ -0,0 +1,59 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the ISCAS nor the names of its contributors may | |||
| be used to endorse or promote products derived from this software | |||
| without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include "common.h" | |||
| /* Build-configuration string, assembled at compile time by adjacent | |||
| * string-literal concatenation. It starts empty and gains one token | |||
| * (followed by a single space) for each of the macros below that is | |||
| * defined when this file is compiled. */ | |||
| static char* openblas_config_str="" | |||
| #ifdef USE64BITINT | |||
| "USE64BITINT " | |||
| #endif | |||
| #ifdef NO_CBLAS | |||
| "NO_CBLAS " | |||
| #endif | |||
| #ifdef NO_LAPACK | |||
| "NO_LAPACK " | |||
| #endif | |||
| #ifdef NO_LAPACKE | |||
| "NO_LAPACKE " | |||
| #endif | |||
| #ifdef DYNAMIC_ARCH | |||
| "DYNAMIC_ARCH " | |||
| #endif | |||
| #ifdef NO_AFFINITY | |||
| "NO_AFFINITY " | |||
| #endif | |||
| ; | |||
| /* Return the build-configuration string. | |||
| * The result points at a string literal with static storage duration: | |||
| * callers must not modify or free it. Declared with (void) so the | |||
| * definition is a proper prototype taking no arguments. */ | |||
| char* CNAME(void) { | |||
| return openblas_config_str; | |||
| } | |||
| @@ -163,7 +163,7 @@ int get_L2_size(void){ | |||
| int eax, ebx, ecx, edx; | |||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \ | |||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ | |||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | |||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) | |||
| @@ -22,6 +22,11 @@ ifeq ($(OSNAME), WINNT) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| EXTRALIB += -lgfortran | |||
| endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(C_COMPILER), GCC) | |||
| EXTRALIB += -lgomp | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| @@ -74,6 +74,7 @@ | |||
| @misc_no_underscore_objs = ( | |||
| openblas_set_num_threads, goto_set_num_threads, | |||
| openblas_get_config, | |||
| ); | |||
| @misc_underscore_objs = ( | |||
| @@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "OPTERON" | |||
| #endif | |||
| #if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER) | |||
| #if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| @@ -380,6 +380,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "BOBCAT" | |||
| #endif | |||
| #if defined (FORCE_BULLDOZER) | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "BULLDOZER" | |||
| #define ARCHCONFIG "-DBULLDOZER " \ | |||
| "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ | |||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ | |||
| "-DHAVE_AVX -DHAVE_FMA4" | |||
| #define LIBNAME "bulldozer" | |||
| #define CORENAME "BULLDOZER" | |||
| #endif | |||
| #ifdef FORCE_SSE_GENERIC | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| @@ -34,7 +34,7 @@ int main(int argc, char **argv) { | |||
| #ifdef USE64BITINT | |||
| printf("#define USE64BITINT\n"); | |||
| #endif | |||
| printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD); | |||
| printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD); | |||
| } | |||
| return 0; | |||
| @@ -810,6 +810,22 @@ static void init_parameter(void) { | |||
| #endif | |||
| #endif | |||
| #ifdef BULLDOZER | |||
| #ifdef DEBUG | |||
| fprintf(stderr, "Bulldozer\n"); | |||
| #endif | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #ifdef EXPRECISION | |||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||
| #endif | |||
| #endif | |||
| #ifdef NANO | |||
| #ifdef DEBUG | |||
| @@ -0,0 +1,59 @@ | |||
| # Kernel selection table. | |||
| # GEMM and GEMM3M entries name *_barcelona.S sources; TRSM entries name | |||
| # *_sse.S / *_sse2.S sources. Variables assigned an empty value (for example | |||
| # SGEMMINCOPY) name no source file here. | |||
| # | |||
| # SGEMM: kernel, copy sources (*COPY) and their object names (*COPYOBJ). | |||
| SGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||
| SGEMMINCOPY = | |||
| SGEMMITCOPY = | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| SGEMMINCOPYOBJ = | |||
| SGEMMITCOPYOBJ = | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # DGEMM: kernel, copy sources and their object names. | |||
| DGEMMKERNEL = gemm_kernel_2x4_barcelona.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # CGEMM: kernel, copy sources and their object names. | |||
| CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||
| CGEMMINCOPY = | |||
| CGEMMITCOPY = | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMINCOPYOBJ = | |||
| CGEMMITCOPYOBJ = | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # ZGEMM: kernel, copy sources and their object names. | |||
| ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # TRSM kernels, one variable per LN / LT / RN / RT variant. | |||
| # NOTE(review): every *_RN entry names an LT kernel source, and | |||
| # ZTRSMKERNEL_LN also names ztrsm_kernel_LT_1x2_sse2.S - presumably | |||
| # intentional; confirm. | |||
| STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S | |||
| STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S | |||
| STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S | |||
| STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S | |||
| DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S | |||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S | |||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S | |||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S | |||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S | |||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S | |||
| # GEMM3M kernels for the two complex precisions. | |||
| CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S | |||
| @@ -596,7 +596,7 @@ | |||
| .L22: | |||
| mulps %xmm0, %xmm2 | |||
| addps %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movsd 4 * SIZE(BB), %xmm2 | |||
| @@ -842,7 +842,7 @@ | |||
| .L32: | |||
| mulss %xmm0, %xmm2 | |||
| addss %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 4 * SIZE(BB), %xmm2 | |||
| @@ -1168,7 +1168,7 @@ | |||
| .L52: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulps 4 * SIZE(BB), %xmm0 | |||
| @@ -1198,7 +1198,7 @@ | |||
| addps %xmm0, %xmm5 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm2 | |||
| @@ -1347,7 +1347,7 @@ | |||
| ALIGN_4 | |||
| .L62: | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| @@ -1531,7 +1531,7 @@ | |||
| .L72: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulss 4 * SIZE(BB), %xmm0 | |||
| @@ -1778,7 +1778,7 @@ | |||
| .L92: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(AA), %xmm0 | |||
| @@ -1793,7 +1793,7 @@ | |||
| mulps 12 * SIZE(BB), %xmm0 | |||
| addps %xmm0, %xmm7 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm3 | |||
| @@ -1924,7 +1924,7 @@ | |||
| .L102: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movsd 2 * SIZE(AA), %xmm0 | |||
| @@ -2069,7 +2069,7 @@ | |||
| .L112: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 1 * SIZE(AA), %xmm0 | |||
| @@ -89,17 +89,22 @@ | |||
| #endif | |||
| #define STACKSIZE 16 | |||
| #define M 4 + STACKSIZE(%esp) | |||
| #define N 8 + STACKSIZE(%esp) | |||
| #define ALPHA 16 + STACKSIZE(%esp) | |||
| #define A 20 + STACKSIZE(%esp) | |||
| #define STACK_LDA 24 + STACKSIZE(%esp) | |||
| #define STACK_X 28 + STACKSIZE(%esp) | |||
| #define STACK_INCX 32 + STACKSIZE(%esp) | |||
| #define Y 36 + STACKSIZE(%esp) | |||
| #define STACK_INCY 40 + STACKSIZE(%esp) | |||
| #define BUFFER 44 + STACKSIZE(%esp) | |||
| #define ARGS 16 | |||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||
| #define ALPHA 16 + STACKSIZE+ARGS(%esp) | |||
| #define A 20 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_LDA 24 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_X 28 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCX 32 + STACKSIZE+ARGS(%esp) | |||
| #define Y 36 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCY 40 + STACKSIZE+ARGS(%esp) | |||
| #define BUFFER 44 + STACKSIZE+ARGS(%esp) | |||
| #define MMM 0+ARGS(%esp) | |||
| #define YY 4+ARGS(%esp) | |||
| #define AA 8+ARGS(%esp) | |||
| #define LDAX 12+ARGS(%esp) | |||
| #define I %eax | |||
| #define J %ebx | |||
| @@ -114,6 +119,7 @@ | |||
| PROLOGUE | |||
| subl $ARGS,%esp | |||
| pushl %ebp | |||
| pushl %edi | |||
| pushl %esi | |||
| @@ -121,7 +127,34 @@ | |||
| PROFCODE | |||
| movl Y,J | |||
| movl J,YY # backup Y | |||
| movl A,J | |||
| movl J,AA # backup A | |||
| movl M,J | |||
| movl J,MMM # backup MM | |||
| .L0t: | |||
| xorl J,J | |||
| addl $1,J | |||
| sall $21,J | |||
| subl J,MMM | |||
| movl J,M | |||
| jge .L00t | |||
| ALIGN_4 | |||
| movl MMM,%eax | |||
| addl J,%eax | |||
| jle .L999x | |||
| movl %eax,M | |||
| .L00t: | |||
| movl AA,%eax | |||
| movl %eax,A | |||
| movl YY,J | |||
| movl J,Y | |||
| movl STACK_LDA, LDA | |||
| movl STACK_X, X | |||
| movl STACK_INCX, INCX | |||
| @@ -651,12 +684,22 @@ | |||
| addss 0 * SIZE(X), %xmm0 | |||
| movss %xmm0, (Y1) | |||
| ALIGN_3 | |||
| .L999: | |||
| movl M,J | |||
| leal (,J,SIZE),%eax | |||
| addl %eax,AA | |||
| movl YY,J | |||
| addl %eax,J | |||
| movl J,YY | |||
| jmp .L0t | |||
| ALIGN_4 | |||
| .L999x: | |||
| popl %ebx | |||
| popl %esi | |||
| popl %edi | |||
| popl %ebp | |||
| addl $ARGS,%esp | |||
| ret | |||
| EPILOGUE | |||
| @@ -76,17 +76,22 @@ | |||
| #endif | |||
| #define STACKSIZE 16 | |||
| #define M 4 + STACKSIZE(%esp) | |||
| #define N 8 + STACKSIZE(%esp) | |||
| #define ALPHA 16 + STACKSIZE(%esp) | |||
| #define A 24 + STACKSIZE(%esp) | |||
| #define STACK_LDA 28 + STACKSIZE(%esp) | |||
| #define STACK_X 32 + STACKSIZE(%esp) | |||
| #define STACK_INCX 36 + STACKSIZE(%esp) | |||
| #define Y 40 + STACKSIZE(%esp) | |||
| #define STACK_INCY 44 + STACKSIZE(%esp) | |||
| #define BUFFER 48 + STACKSIZE(%esp) | |||
| #define ARGS 16 | |||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||
| #define ALPHA 16 + STACKSIZE+ARGS(%esp) | |||
| #define A 24 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_LDA 28 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_X 32 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCX 36 + STACKSIZE+ARGS(%esp) | |||
| #define Y 40 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCY 44 + STACKSIZE+ARGS(%esp) | |||
| #define BUFFER 48 + STACKSIZE+ARGS(%esp) | |||
| #define MMM 0+ARGS(%esp) | |||
| #define YY 4+ARGS(%esp) | |||
| #define AA 8+ARGS(%esp) | |||
| #define I %eax | |||
| #define J %ebx | |||
| @@ -101,6 +106,8 @@ | |||
| PROLOGUE | |||
| subl $ARGS,%esp | |||
| pushl %ebp | |||
| pushl %edi | |||
| pushl %esi | |||
| @@ -108,6 +115,33 @@ | |||
| PROFCODE | |||
| movl Y,J | |||
| movl J,YY # backup Y | |||
| movl A,J | |||
| movl J,AA # backup A | |||
| movl M,J | |||
| movl J,MMM # backup MM | |||
| .L0t: | |||
| xorl J,J | |||
| addl $1,J | |||
| sall $20,J | |||
| subl J,MMM | |||
| movl J,M | |||
| jge .L00t | |||
| ALIGN_4 | |||
| movl MMM,%eax | |||
| addl J,%eax | |||
| jle .L999x | |||
| movl %eax,M | |||
| .L00t: | |||
| movl AA,%eax | |||
| movl %eax,A | |||
| movl YY,J | |||
| movl J,Y | |||
| movl STACK_LDA, LDA | |||
| movl STACK_X, X | |||
| movl STACK_INCX, INCX | |||
| @@ -677,10 +711,22 @@ | |||
| ALIGN_3 | |||
| .L999: | |||
| movl M,J | |||
| leal (,J,SIZE),%eax | |||
| addl %eax,AA | |||
| movl YY,J | |||
| addl %eax,J | |||
| movl J,YY | |||
| jmp .L0t | |||
| ALIGN_4 | |||
| .L999x: | |||
| popl %ebx | |||
| popl %esi | |||
| popl %edi | |||
| popl %ebp | |||
| addl $ARGS,%esp | |||
| ret | |||
| EPILOGUE | |||
| @@ -89,17 +89,24 @@ | |||
| #endif | |||
| #define STACKSIZE 16 | |||
| #define M 4 + STACKSIZE(%esp) | |||
| #define N 8 + STACKSIZE(%esp) | |||
| #define ALPHA 16 + STACKSIZE(%esp) | |||
| #define A 20 + STACKSIZE(%esp) | |||
| #define STACK_LDA 24 + STACKSIZE(%esp) | |||
| #define STACK_X 28 + STACKSIZE(%esp) | |||
| #define STACK_INCX 32 + STACKSIZE(%esp) | |||
| #define Y 36 + STACKSIZE(%esp) | |||
| #define STACK_INCY 40 + STACKSIZE(%esp) | |||
| #define BUFFER 44 + STACKSIZE(%esp) | |||
| #define ARGS 20 | |||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||
| #define ALPHA 16 + STACKSIZE+ARGS(%esp) | |||
| #define A 20 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_LDA 24 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_X 28 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCX 32 + STACKSIZE+ARGS(%esp) | |||
| #define Y 36 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCY 40 + STACKSIZE+ARGS(%esp) | |||
| #define BUFFER 44 + STACKSIZE+ARGS(%esp) | |||
| #define MMM 0+STACKSIZE(%esp) | |||
| #define NN 4+STACKSIZE(%esp) | |||
| #define AA 8+STACKSIZE(%esp) | |||
| #define LDAX 12+STACKSIZE(%esp) | |||
| #define XX 16+STACKSIZE(%esp) | |||
| #define I %eax | |||
| #define J %ebx | |||
| @@ -114,6 +121,7 @@ | |||
| PROLOGUE | |||
| subl $ARGS,%esp | |||
| pushl %ebp | |||
| pushl %edi | |||
| pushl %esi | |||
| @@ -122,7 +130,42 @@ | |||
| PROFCODE | |||
| movl STACK_LDA, LDA | |||
| movl LDA,LDAX # backup LDA | |||
| movl STACK_X, X | |||
| movl X,XX | |||
| movl N,J | |||
| movl J,NN # backup N | |||
| movl A,J | |||
| movl J,AA # backup A | |||
| movl M,J | |||
| movl J,MMM # mov M to MMM | |||
| .L0t: | |||
| xorl J,J | |||
| addl $1,J | |||
| sall $22,J # J=2^22 floats; 2^22*sizeof(float)=buffer size(16MB) | |||
| subl $8, J # Don't use the last 8 floats in the buffer. | |||
| # Now, split M by block J | |||
| subl J,MMM # MMM=MMM-J | |||
| movl J,M # movl leaves the flags from subl intact for the jge below | |||
| jge .L00t # at least one full block was left: process J rows | |||
| ALIGN_4 | |||
| movl MMM,%eax | |||
| addl J,%eax # eax = rows that were left before the subtraction | |||
| jle .L999x # nothing left: leave the function | |||
| movl %eax,M # final partial block | |||
| .L00t: | |||
| movl AA,%eax | |||
| movl %eax,A # mov AA to A | |||
| movl NN,%eax | |||
| movl %eax,N # reset N | |||
| movl LDAX, LDA # reset LDA | |||
| movl XX,X | |||
| movl STACK_INCX, INCX | |||
| movl STACK_INCY, INCY | |||
| @@ -198,6 +241,20 @@ | |||
| jg .L06 | |||
| ALIGN_4 | |||
| //Padding zero to prevent loading the dirty number from buffer. | |||
| movl M, I | |||
| movl $8, J | |||
| andl $7, I | |||
| xorps %xmm0, %xmm0 | |||
| subl I, J | |||
| ALIGN_2 | |||
| .L07: | |||
| movss %xmm0, 0 * SIZE(Y1) | |||
| addl $SIZE, Y1 | |||
| decl J | |||
| jg .L07 | |||
| ALIGN_4 | |||
| .L10: | |||
| movl Y, Y1 | |||
| @@ -628,10 +685,22 @@ | |||
| ALIGN_4 | |||
| .L999: | |||
| movl M,J | |||
| leal (,J,SIZE),%eax | |||
| addl %eax,AA | |||
| movl XX,J | |||
| addl %eax,J | |||
| movl J,XX | |||
| jmp .L0t | |||
| ALIGN_4 | |||
| .L999x: | |||
| popl %ebx | |||
| popl %esi | |||
| popl %edi | |||
| popl %ebp | |||
| addl $ARGS,%esp | |||
| ret | |||
| EPILOGUE | |||
| @@ -76,18 +76,24 @@ | |||
| #endif | |||
| #define STACKSIZE 16 | |||
| #define ARGS 16 | |||
| #define M 4 + STACKSIZE+ARGS(%esp) | |||
| #define N 8 + STACKSIZE+ARGS(%esp) | |||
| #define ALPHA 16 + STACKSIZE+ARGS(%esp) | |||
| #define A 24 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_LDA 28 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_X 32 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCX 36 + STACKSIZE+ARGS(%esp) | |||
| #define Y 40 + STACKSIZE+ARGS(%esp) | |||
| #define STACK_INCY 44 + STACKSIZE+ARGS(%esp) | |||
| #define BUFFER 48 + STACKSIZE+ARGS(%esp) | |||
| #define MMM 0+STACKSIZE(%esp) | |||
| #define AA 4+STACKSIZE(%esp) | |||
| #define LDAX 8+STACKSIZE(%esp) | |||
| #define NN 12+STACKSIZE(%esp) | |||
| #define M 4 + STACKSIZE(%esp) | |||
| #define N 8 + STACKSIZE(%esp) | |||
| #define ALPHA 16 + STACKSIZE(%esp) | |||
| #define A 24 + STACKSIZE(%esp) | |||
| #define STACK_LDA 28 + STACKSIZE(%esp) | |||
| #define STACK_X 32 + STACKSIZE(%esp) | |||
| #define STACK_INCX 36 + STACKSIZE(%esp) | |||
| #define Y 40 + STACKSIZE(%esp) | |||
| #define STACK_INCY 44 + STACKSIZE(%esp) | |||
| #define BUFFER 48 + STACKSIZE(%esp) | |||
| #define I %eax | |||
| #define J %ebx | |||
| @@ -101,6 +107,8 @@ | |||
| PROLOGUE | |||
| subl $ARGS,%esp | |||
| pushl %ebp | |||
| pushl %edi | |||
| pushl %esi | |||
| @@ -108,7 +116,40 @@ | |||
| PROFCODE | |||
| movl STACK_LDA, LDA | |||
| movl LDA,LDAX # backup LDA | |||
| movl N,J | |||
| movl J,NN # backup N | |||
| movl A,J | |||
| movl J,AA # backup A | |||
| movl M,J | |||
| movl J,MMM # mov M to MMM | |||
| .L0t: | |||
| xorl J,J | |||
| addl $1,J | |||
| sall $21,J # J=2^21*sizeof(double)=buffer size(16MB) | |||
| subl $4, J # Don't use last 4 double in the buffer. | |||
| # Now, split M by block J | |||
| subl J,MMM # MMM=MMM-J | |||
| movl J,M | |||
| jge .L00t | |||
| ALIGN_4 | |||
| movl MMM,%eax | |||
| addl J,%eax | |||
| jle .L999x | |||
| movl %eax,M | |||
| .L00t: | |||
| movl AA,%eax | |||
| movl %eax,A # mov AA to A | |||
| movl NN,%eax | |||
| movl %eax,N # reset N | |||
| movl LDAX, LDA # reset LDA | |||
| movl STACK_X, X | |||
| movl STACK_INCX, INCX | |||
| movl STACK_INCY, INCY | |||
| @@ -117,6 +158,7 @@ | |||
| leal (,INCY, SIZE), INCY | |||
| leal (,LDA, SIZE), LDA | |||
| subl $-16 * SIZE, A | |||
| cmpl $0, N | |||
| @@ -560,10 +602,19 @@ | |||
| ALIGN_4 | |||
| .L999: | |||
| // End of one M-block: advance the saved A pointer (AA) by M*SIZE and | |||
| // jump back to .L0t for the next block. | |||
| // NOTE(review): X is reloaded from STACK_X at .L00t on every pass and is | |||
| // not advanced here - confirm this is intended when M is split into more | |||
| // than one block. | |||
| movl M,J | |||
| leal (,J,SIZE),%eax | |||
| addl %eax,AA | |||
| jmp .L0t | |||
| ALIGN_4 | |||
| .L999x: | |||
| popl %ebx | |||
| popl %esi | |||
| popl %edi | |||
| popl %ebp | |||
| addl $ARGS,%esp | |||
| ret | |||
| EPILOGUE | |||
| @@ -269,7 +269,7 @@ | |||
| sarl $5, I | |||
| jle .L113 | |||
| #if defined(BARCELONA) | |||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||
| movaps %xmm0, %xmm1 | |||
| mulps -32 * SIZE(X), %xmm1 | |||
| @@ -253,7 +253,7 @@ | |||
| sarl $4, I | |||
| jle .L113 | |||
| #if defined(BARCELONA) | |||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||
| movaps %xmm0, %xmm1 | |||
| mulpd -16 * SIZE(X), %xmm1 | |||
| @@ -69,7 +69,7 @@ | |||
| #define STACK_ALIGN 4096 | |||
| #define STACK_OFFSET 1024 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHSIZE (8 * 10 + 4) | |||
| #endif | |||
| @@ -439,7 +439,7 @@ | |||
| .L22: | |||
| mulsd %xmm0, %xmm2 | |||
| addsd %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movlpd 2 * SIZE(BB), %xmm2 | |||
| @@ -488,7 +488,7 @@ | |||
| movlpd 40 * SIZE(BB), %xmm3 | |||
| addsd %xmm0, %xmm7 | |||
| movlpd 8 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | |||
| #endif | |||
| mulsd %xmm1, %xmm2 | |||
| @@ -1697,7 +1697,7 @@ | |||
| .L42: | |||
| mulpd %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulpd 2 * SIZE(BB), %xmm0 | |||
| @@ -1727,7 +1727,7 @@ | |||
| addpd %xmm0, %xmm7 | |||
| movapd 16 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | |||
| #endif | |||
| mulpd %xmm1, %xmm2 | |||
| @@ -64,7 +64,7 @@ | |||
| #define BORIG 60(%esp) | |||
| #define BUFFER 128(%esp) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 10 + 8) | |||
| @@ -437,7 +437,7 @@ | |||
| .L32: | |||
| mulss %xmm0, %xmm2 | |||
| addss %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 4 * SIZE(BB), %xmm2 | |||
| @@ -833,7 +833,7 @@ | |||
| .L22: | |||
| mulps %xmm0, %xmm2 | |||
| addps %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(BB), %xmm2 | |||
| @@ -1848,7 +1848,7 @@ | |||
| .L72: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulss 4 * SIZE(BB), %xmm0 | |||
| @@ -2109,7 +2109,7 @@ | |||
| ALIGN_4 | |||
| .L62: | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| @@ -2429,7 +2429,7 @@ | |||
| .L52: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulps 4 * SIZE(BB), %xmm0 | |||
| @@ -2459,7 +2459,7 @@ | |||
| addps %xmm0, %xmm5 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm2 | |||
| @@ -2952,7 +2952,7 @@ | |||
| .L112: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 1 * SIZE(AA), %xmm0 | |||
| @@ -3148,7 +3148,7 @@ | |||
| .L102: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movsd 2 * SIZE(AA), %xmm0 | |||
| @@ -3389,7 +3389,7 @@ | |||
| .L92: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(AA), %xmm0 | |||
| @@ -3404,7 +3404,7 @@ | |||
| mulps 12 * SIZE(BB), %xmm0 | |||
| addps %xmm0, %xmm7 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm3 | |||
| @@ -69,7 +69,7 @@ | |||
| #define STACK_ALIGN 4096 | |||
| #define STACK_OFFSET 1024 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHSIZE (8 * 10 + 4) | |||
| #endif | |||
| @@ -910,7 +910,7 @@ | |||
| .L22: | |||
| mulsd %xmm0, %xmm2 | |||
| addsd %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movlpd 2 * SIZE(BB), %xmm2 | |||
| @@ -959,7 +959,7 @@ | |||
| movlpd 40 * SIZE(BB), %xmm3 | |||
| addsd %xmm0, %xmm7 | |||
| movlpd 8 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | |||
| #endif | |||
| mulsd %xmm1, %xmm2 | |||
| @@ -1439,7 +1439,7 @@ | |||
| .L42: | |||
| mulpd %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulpd 2 * SIZE(BB), %xmm0 | |||
| @@ -1469,7 +1469,7 @@ | |||
| addpd %xmm0, %xmm7 | |||
| movapd 16 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | |||
| #endif | |||
| mulpd %xmm1, %xmm2 | |||
| @@ -64,7 +64,7 @@ | |||
| #define BORIG 60(%esp) | |||
| #define BUFFER 128(%esp) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 10 + 8) | |||
| @@ -872,7 +872,7 @@ | |||
| .L22: | |||
| mulps %xmm0, %xmm2 | |||
| addps %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(BB), %xmm2 | |||
| @@ -1316,7 +1316,7 @@ | |||
| .L32: | |||
| mulss %xmm0, %xmm2 | |||
| addss %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 4 * SIZE(BB), %xmm2 | |||
| @@ -1855,7 +1855,7 @@ | |||
| .L52: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulps 4 * SIZE(BB), %xmm0 | |||
| @@ -1885,7 +1885,7 @@ | |||
| addps %xmm0, %xmm5 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm2 | |||
| @@ -2249,7 +2249,7 @@ | |||
| ALIGN_4 | |||
| .L62: | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| @@ -2562,7 +2562,7 @@ | |||
| .L72: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulss 4 * SIZE(BB), %xmm0 | |||
| @@ -2957,7 +2957,7 @@ | |||
| .L92: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(AA), %xmm0 | |||
| @@ -2972,7 +2972,7 @@ | |||
| mulps 12 * SIZE(BB), %xmm0 | |||
| addps %xmm0, %xmm7 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm3 | |||
| @@ -3280,7 +3280,7 @@ | |||
| .L102: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movsd 2 * SIZE(AA), %xmm0 | |||
| @@ -3515,7 +3515,7 @@ | |||
| .L112: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 1 * SIZE(AA), %xmm0 | |||
| @@ -69,7 +69,7 @@ | |||
| #define STACK_ALIGN 4096 | |||
| #define STACK_OFFSET 1024 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHSIZE (8 * 10 + 4) | |||
| #endif | |||
| @@ -1036,7 +1036,7 @@ | |||
| .L42: | |||
| mulpd %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulpd 2 * SIZE(BB), %xmm0 | |||
| @@ -1066,7 +1066,7 @@ | |||
| addpd %xmm0, %xmm7 | |||
| movapd 16 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | |||
| #endif | |||
| mulpd %xmm1, %xmm2 | |||
| @@ -2224,7 +2224,7 @@ | |||
| .L22: | |||
| mulsd %xmm0, %xmm2 | |||
| addsd %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movlpd 2 * SIZE(BB), %xmm2 | |||
| @@ -2273,7 +2273,7 @@ | |||
| movlpd 40 * SIZE(BB), %xmm3 | |||
| addsd %xmm0, %xmm7 | |||
| movlpd 8 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | |||
| #endif | |||
| mulsd %xmm1, %xmm2 | |||
| @@ -64,7 +64,7 @@ | |||
| #define BORIG 60(%esp) | |||
| #define BUFFER 128(%esp) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 10 + 8) | |||
| @@ -439,7 +439,7 @@ | |||
| .L92: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(AA), %xmm0 | |||
| @@ -454,7 +454,7 @@ | |||
| mulps 12 * SIZE(BB), %xmm0 | |||
| addps %xmm0, %xmm7 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm3 | |||
| @@ -758,7 +758,7 @@ | |||
| .L102: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movsd 2 * SIZE(AA), %xmm0 | |||
| @@ -993,7 +993,7 @@ | |||
| .L112: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 1 * SIZE(AA), %xmm0 | |||
| @@ -1324,7 +1324,7 @@ | |||
| .L52: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulps 4 * SIZE(BB), %xmm0 | |||
| @@ -1354,7 +1354,7 @@ | |||
| addps %xmm0, %xmm5 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm2 | |||
| @@ -1718,7 +1718,7 @@ | |||
| ALIGN_4 | |||
| .L62: | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| @@ -2031,7 +2031,7 @@ | |||
| .L72: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulss 4 * SIZE(BB), %xmm0 | |||
| @@ -2859,7 +2859,7 @@ | |||
| .L22: | |||
| mulps %xmm0, %xmm2 | |||
| addps %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(BB), %xmm2 | |||
| @@ -3303,7 +3303,7 @@ | |||
| .L32: | |||
| mulss %xmm0, %xmm2 | |||
| addss %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 4 * SIZE(BB), %xmm2 | |||
| @@ -74,7 +74,7 @@ | |||
| #define BB %ecx | |||
| #define LDC %ebp | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| #define movsd movlps | |||
| #endif | |||
| @@ -625,7 +625,7 @@ | |||
| .L22: | |||
| mulps %xmm0, %xmm2 | |||
| addps %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movsd 4 * SIZE(BB), %xmm2 | |||
| @@ -870,7 +870,7 @@ | |||
| .L32: | |||
| mulss %xmm0, %xmm2 | |||
| addss %xmm2, %xmm4 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 4 * SIZE(BB), %xmm2 | |||
| @@ -1173,7 +1173,7 @@ | |||
| .L52: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulps 4 * SIZE(BB), %xmm0 | |||
| @@ -1203,7 +1203,7 @@ | |||
| addps %xmm0, %xmm5 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm2 | |||
| @@ -1359,7 +1359,7 @@ | |||
| ALIGN_4 | |||
| .L62: | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| @@ -1536,7 +1536,7 @@ | |||
| .L72: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| mulss 4 * SIZE(BB), %xmm0 | |||
| @@ -1794,7 +1794,7 @@ | |||
| .L92: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movaps 4 * SIZE(AA), %xmm0 | |||
| @@ -1809,7 +1809,7 @@ | |||
| mulps 12 * SIZE(BB), %xmm0 | |||
| addps %xmm0, %xmm7 | |||
| movaps 32 * SIZE(AA), %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| mulps %xmm1, %xmm3 | |||
| @@ -1936,7 +1936,7 @@ | |||
| .L102: | |||
| mulps %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movsd 2 * SIZE(AA), %xmm0 | |||
| @@ -2069,7 +2069,7 @@ | |||
| .L112: | |||
| mulss %xmm0, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | |||
| #endif | |||
| movss 1 * SIZE(AA), %xmm0 | |||
| @@ -71,7 +71,7 @@ | |||
| #define movsd movlps | |||
| #endif | |||
| #ifdef BARCELONA | |||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||
| #define PREFETCH prefetchnta | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 5) | |||
| @@ -58,7 +58,7 @@ | |||
| #define movsd movlps | |||
| #endif | |||
| #ifdef BARCELONA | |||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||
| #define PREFETCH prefetchnta | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (8 * 5) | |||
| @@ -71,7 +71,7 @@ | |||
| #define movsd movlps | |||
| #endif | |||
| #ifdef BARCELONA | |||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||
| #define PREFETCH prefetchnta | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 5) | |||
| @@ -58,7 +58,7 @@ | |||
| #define movsd movlps | |||
| #endif | |||
| #ifdef BARCELONA | |||
| #if defined(BARCELONA) || defined(BULLDOZER) | |||
| #define PREFETCH prefetchnta | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (8 * 5) | |||
| @@ -75,7 +75,7 @@ | |||
| #define STACK_ALIGN 4096 | |||
| #define STACK_OFFSET 1024 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCHSIZE (16 * 10 + 8) | |||
| #define WPREFETCHSIZE 112 | |||
| #define PREFETCH prefetch | |||
| @@ -533,7 +533,7 @@ | |||
| addps %xmm0, %xmm7 | |||
| movsd 16 * SIZE(AA), %xmm0 | |||
| mulps %xmm1, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| addps %xmm2, %xmm4 | |||
| @@ -75,7 +75,7 @@ | |||
| #define STACK_ALIGN 4096 | |||
| #define STACK_OFFSET 1024 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCHSIZE (16 * 10 + 8) | |||
| #define WPREFETCHSIZE 112 | |||
| #define PREFETCH prefetch | |||
| @@ -994,7 +994,7 @@ | |||
| addps %xmm0, %xmm7 | |||
| movsd 16 * SIZE(AA), %xmm0 | |||
| mulps %xmm1, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| addps %xmm2, %xmm4 | |||
| @@ -75,7 +75,7 @@ | |||
| #define STACK_ALIGN 4096 | |||
| #define STACK_OFFSET 1024 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCHSIZE (16 * 10 + 8) | |||
| #define WPREFETCHSIZE 112 | |||
| #define PREFETCH prefetch | |||
| @@ -1820,7 +1820,7 @@ | |||
| addps %xmm0, %xmm7 | |||
| movsd 16 * SIZE(AA), %xmm0 | |||
| mulps %xmm1, %xmm2 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) | |||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | |||
| #endif | |||
| addps %xmm2, %xmm4 | |||
| @@ -0,0 +1,62 @@ | |||
# Source files chosen for the Z-prefixed GEMV kernels: one for the "N"
# variant and one for the "T" variant. Both point at the *_dup.S files.
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVTKERNEL = zgemv_t_dup.S | |||
# SGEMM: compute kernel, the four copy-routine sources (IN/IT/ON/OT), and
# the object names built from them.
# The kernel named here is the Barcelona 8x4 file; this group references no
# Bulldozer-specific source.
# The I-side copies come from ../generic (8-wide); the O-side copies use the
# 4-wide Opteron assembly files.
| SGEMMKERNEL = gemm_kernel_8x4_barcelona.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||
| SGEMMONCOPY = gemm_ncopy_4_opteron.S | |||
| SGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||
# Object file names carry $(TSUFFIX) and the $(SUFFIX) extension, both of
# which are defined outside this file.
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# DGEMM: the only group in this file that names a Bulldozer-specific kernel
# source (dgemm_kernel_4x4_bulldozer.S).
# The IN/IT copy sources and their object names are deliberately assigned
# empty values; only the ON/OT copies are named.
# NOTE(review): presumably the I-side copies are unnecessary because the
# kernel is square (4x4) and the O-side routines are reused -- confirm
# against the Makefile that consumes these variables.
| DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S | |||
| DGEMMINCOPY = | |||
| DGEMMITCOPY = | |||
| DGEMMONCOPY = gemm_ncopy_4_opteron.S | |||
| DGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||
| DGEMMINCOPYOBJ = | |||
| DGEMMITCOPYOBJ = | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# CGEMM: compute kernel, copy-routine sources, and object names.
# The kernel is the Barcelona 4x2 zgemm file; the I-side copies come from
# ../generic (4-wide) and the O-side copies use the 2-wide zgemm assembly.
| CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPY = zgemm_ncopy_2.S | |||
| CGEMMOTCOPY = zgemm_tcopy_2.S | |||
# Object names use the cgemm_ prefix even though the sources are zgemm_*
# files; $(TSUFFIX) and $(SUFFIX) are defined outside this file.
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# ZGEMM: Barcelona 2x2 kernel. As in the DGEMM group, the IN/IT copy sources
# and their object names are assigned empty values; only ON/OT are named.
# NOTE(review): presumably the square 2x2 kernel lets the O-side copies
# serve both operands -- confirm against the consuming Makefile.
| ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||
| ZGEMMINCOPY = | |||
| ZGEMMITCOPY = | |||
| ZGEMMONCOPY = zgemm_ncopy_2.S | |||
| ZGEMMOTCOPY = zgemm_tcopy_2.S | |||
| ZGEMMINCOPYOBJ = | |||
| ZGEMMITCOPYOBJ = | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# TRSM kernel sources for the S, D, C and Z groups, four variants each
# (LN, LT, RN, RT).
# In every group the RN variable names the same file as the LT variable
# (e.g. STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S); there is no *_RN_* source.
# NOTE(review): the pattern is identical across all four groups, so it looks
# intentional rather than a copy/paste slip -- confirm before changing.
| STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S | |||
| STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S | |||
| STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S | |||
| STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S | |||
# D group: Barcelona 4x4 TRSM sources.
| DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S | |||
| DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S | |||
# C group: 4x2 SSE ztrsm sources.
| CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S | |||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S | |||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S | |||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S | |||
# Z group: 2x2 SSE2 ztrsm sources.
| ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S | |||
# GEMM3M kernel sources for the C and Z groups; both reuse Barcelona files
# (8x4 for C, 4x4 for Z).
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||
| @@ -530,7 +530,7 @@ | |||
| #endif | |||
| movsd -32 * SIZE(Y), %xmm8 | |||
| pshufd $0x39, %xmm4, %xmm5 | |||
| pshufd $0x29, %xmm4, %xmm5 | |||
| mulps %xmm8, %xmm5 | |||
| addps %xmm5, %xmm3 | |||
| @@ -750,7 +750,8 @@ | |||
| xorps %xmm5, %xmm5 | |||
| movhlps %xmm4, %xmm5 | |||
| mulps -32 * SIZE(Y), %xmm5 | |||
| movlps -32 * SIZE(Y), %xmm4 | |||
| mulps %xmm4, %xmm5 | |||
| addps %xmm5, %xmm0 | |||
| addq $2 * SIZE, X | |||
| @@ -992,7 +993,7 @@ | |||
| movsd -32 * SIZE(Y), %xmm8 | |||
| movss %xmm5, %xmm4 | |||
| shufps $0x93, %xmm5, %xmm4 | |||
| shufps $0x93, %xmm4, %xmm4 | |||
| mulps %xmm8, %xmm4 | |||
| addps %xmm4, %xmm3 | |||
| @@ -930,7 +930,7 @@ | |||
| .L22: | |||
| mulps %xmm8, %xmm9 | |||
| addps %xmm9, %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) | |||
| #endif | |||
| movaps 4 * SIZE(BO), %xmm9 | |||
| @@ -983,7 +983,7 @@ | |||
| addps %xmm8, %xmm3 | |||
| movaps 0 * SIZE(AO), %xmm8 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) | |||
| #endif | |||
| mulps %xmm10, %xmm9 | |||
| @@ -1178,7 +1178,7 @@ | |||
| .L32: | |||
| mulps %xmm8, %xmm9 | |||
| addps %xmm9, %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) | |||
| #endif | |||
| movsd 4 * SIZE(BO), %xmm9 | |||
| @@ -1423,7 +1423,7 @@ | |||
| .L42: | |||
| mulss %xmm8, %xmm9 | |||
| addss %xmm9, %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) | |||
| #endif | |||
| movss 4 * SIZE(BO), %xmm9 | |||
| @@ -1765,7 +1765,7 @@ | |||
| .L62: | |||
| mulps %xmm8, %xmm9 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) | |||
| #endif | |||
| mulps 4 * SIZE(BO), %xmm8 | |||
| @@ -1793,7 +1793,7 @@ | |||
| addps %xmm8, %xmm5 | |||
| movaps 32 * SIZE(AO), %xmm8 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) | |||
| #endif | |||
| mulps %xmm10, %xmm11 | |||
| @@ -1822,7 +1822,7 @@ | |||
| addps %xmm10, %xmm5 | |||
| movaps 48 * SIZE(AO), %xmm10 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) | |||
| #endif | |||
| mulps %xmm12, %xmm13 | |||
| @@ -1851,7 +1851,7 @@ | |||
| addps %xmm12, %xmm5 | |||
| movaps 64 * SIZE(AO), %xmm12 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) | |||
| #endif | |||
| mulps %xmm14, %xmm15 | |||
| @@ -2024,7 +2024,7 @@ | |||
| .L72: | |||
| mulps %xmm8, %xmm9 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) | |||
| #endif | |||
| @@ -2208,7 +2208,7 @@ | |||
| .L82: | |||
| mulps %xmm8, %xmm9 | |||
| addps %xmm9, %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) | |||
| #endif | |||
| movsd 4 * SIZE(BO), %xmm9 | |||
| @@ -2395,7 +2395,7 @@ | |||
| .L92: | |||
| mulps %xmm8, %xmm9 | |||
| addps %xmm9, %xmm0 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) | |||
| #endif | |||
| movss 4 * SIZE(BO), %xmm9 | |||
| @@ -2670,7 +2670,7 @@ | |||
| .L112: | |||
| mulps %xmm9, %xmm8 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) | |||
| #endif | |||
| @@ -2687,7 +2687,7 @@ | |||
| addps %xmm9, %xmm4 | |||
| movaps 8 * SIZE(BO), %xmm9 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) | |||
| #endif | |||
| mulps %xmm9, %xmm10 | |||
| @@ -2704,7 +2704,7 @@ | |||
| addps %xmm9, %xmm4 | |||
| movaps 32 * SIZE(BO), %xmm9 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) | |||
| #endif | |||
| mulps %xmm11, %xmm12 | |||
| @@ -2721,7 +2721,7 @@ | |||
| addps %xmm11, %xmm4 | |||
| movaps 24 * SIZE(BO), %xmm11 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) | |||
| #endif | |||
| mulps %xmm11, %xmm14 | |||
| @@ -2857,7 +2857,7 @@ | |||
| .L122: | |||
| mulps %xmm8, %xmm9 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) | |||
| #endif | |||
| movaps -28 * SIZE(AO), %xmm8 | |||
| @@ -2873,7 +2873,7 @@ | |||
| addps %xmm8, %xmm3 | |||
| movaps 0 * SIZE(AO), %xmm8 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) | |||
| #endif | |||
| mulps %xmm10, %xmm11 | |||
| @@ -3003,7 +3003,7 @@ | |||
| .L132: | |||
| mulps %xmm8, %xmm9 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) | |||
| #endif | |||
| movsd -30 * SIZE(AO), %xmm8 | |||
| @@ -3150,7 +3150,7 @@ | |||
| .L142: | |||
| mulss %xmm8, %xmm9 | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) | |||
| #endif | |||
| movss -31 * SIZE(AO), %xmm8 | |||
| @@ -39,7 +39,7 @@ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| #define RPREFETCHSIZE (12 + 4) | |||
| #define WPREFETCHSIZE (48 + 4) | |||
| #define MOVNTQ MOVQ | |||
| @@ -79,7 +79,7 @@ | |||
| #define AO3 %r13 | |||
| #define AO4 %rax | |||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| #define RPREFETCH prefetch | |||
| #else | |||
| #define RPREFETCH prefetch | |||
| @@ -39,7 +39,7 @@ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| #define RPREFETCHSIZE (12 + 4) | |||
| #define WPREFETCHSIZE (12 + 4) | |||
| #define MOVNTQ MOVQ | |||
| @@ -96,7 +96,7 @@ | |||
| #endif | |||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| #define RPREFETCH prefetch | |||
| #else | |||
| #define RPREFETCH prefetch | |||
| @@ -469,7 +469,7 @@ | |||
| ALIGN_4 | |||
| .L71: | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| prefetch PREFETCHSIZE * SIZE(X) | |||
| #endif | |||
| @@ -266,7 +266,7 @@ | |||
| sarq $5, I | |||
| jle .L113 | |||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| movaps %xmm0, %xmm1 | |||
| mulps -32 * SIZE(X), %xmm1 | |||
| @@ -251,7 +251,7 @@ | |||
| sarq $4, I | |||
| jle .L113 | |||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| movaps %xmm0, %xmm1 | |||
| mulpd -16 * SIZE(X), %xmm1 | |||
| @@ -1,4 +1,3 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| @@ -47,7 +46,7 @@ | |||
| #ifndef WINDOWS_ABI | |||
| #define STACKSIZE 64 | |||
| #define STACKSIZE 128 | |||
| #define OLD_M %rdi | |||
| #define OLD_N %rsi | |||
| @@ -57,6 +56,10 @@ | |||
| #define STACK_Y 16 + STACKSIZE(%rsp) | |||
| #define STACK_INCY 24 + STACKSIZE(%rsp) | |||
| #define STACK_BUFFER 32 + STACKSIZE(%rsp) | |||
| #define MMM 56(%rsp) | |||
| #define NN 64(%rsp) | |||
| #define AA 72(%rsp) | |||
| #define LDAX 80(%rsp) | |||
| #else | |||
| @@ -71,6 +74,10 @@ | |||
| #define STACK_Y 72 + STACKSIZE(%rsp) | |||
| #define STACK_INCY 80 + STACKSIZE(%rsp) | |||
| #define STACK_BUFFER 88 + STACKSIZE(%rsp) | |||
| #define MMM 216(%rsp) | |||
| #define NN 224(%rsp) | |||
| #define AA 232(%rsp) | |||
| #define LDAX 240(%rsp) | |||
| #endif | |||
| @@ -127,29 +134,48 @@ | |||
| movups %xmm14, 192(%rsp) | |||
| movups %xmm15, 208(%rsp) | |||
| movq OLD_M, M | |||
| movq OLD_N, N | |||
| movq OLD_A, A | |||
| movq OLD_LDA, LDA | |||
| movq OLD_M, MMM | |||
| movq OLD_N, NN | |||
| movq OLD_A, X | |||
| movq X, AA | |||
| movq OLD_LDA, X | |||
| movq X, LDAX | |||
| movq OLD_X, X | |||
| #else | |||
| movq OLD_M, M | |||
| movq OLD_N, N | |||
| movq OLD_A, A | |||
| movq OLD_LDA, LDA | |||
| movq OLD_M, MMM | |||
| movq OLD_N, NN | |||
| movq OLD_A, AA | |||
| movq OLD_LDA, LDAX | |||
| #endif | |||
| movq STACK_INCX, INCX | |||
| movq STACK_Y, Y | |||
| movq STACK_INCY, INCY | |||
| movq STACK_BUFFER, BUFFER | |||
| #ifndef WINDOWS_ABI | |||
| pshufd $0, %xmm0, ALPHA | |||
| #else | |||
| pshufd $0, %xmm3, ALPHA | |||
| #endif | |||
| .L0t: | |||
| xorq M,M | |||
| addq $1,M | |||
| salq $22,M | |||
| subq M,MMM | |||
| jge .L00t | |||
| ALIGN_4 | |||
| movq MMM,%rax | |||
| addq M,%rax | |||
| jle .L999x | |||
| movq %rax,M | |||
| .L00t: | |||
| movq LDAX,LDA | |||
| movq NN,N | |||
| movq AA,A | |||
| movq STACK_INCX, INCX | |||
| movq STACK_Y, Y | |||
| movq STACK_INCY, INCY | |||
| movq STACK_BUFFER, BUFFER | |||
| leaq (,INCX, SIZE), INCX | |||
| leaq (,INCY, SIZE), INCY | |||
| leaq (,LDA, SIZE), LDA | |||
| @@ -6341,6 +6367,12 @@ | |||
| ALIGN_4 | |||
| .L999: | |||
| leaq (,M,SIZE),%rax | |||
| addq %rax,AA | |||
| jmp .L0t | |||
| ALIGN_4 | |||
| .L999x: | |||
| movq 0(%rsp), %rbx | |||
| movq 8(%rsp), %rbp | |||
| movq 16(%rsp), %r12 | |||
| @@ -76,7 +76,7 @@ | |||
| #define movsd movlps | |||
| #endif | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 16) | |||
| @@ -76,7 +76,7 @@ | |||
| #define movsd movlpd | |||
| #endif | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 16) | |||
| @@ -76,7 +76,7 @@ | |||
| #define movsd movlps | |||
| #endif | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 16) | |||
| @@ -76,7 +76,7 @@ | |||
| #define movsd movlpd | |||
| #endif | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 16) | |||
| @@ -86,7 +86,7 @@ | |||
| #define PREFETCHW prefetcht0 | |||
| #endif | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define movsd movlps | |||
| @@ -86,7 +86,7 @@ | |||
| #define PREFETCHW prefetcht0 | |||
| #endif | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define movsd movlps | |||
| @@ -86,7 +86,7 @@ | |||
| #define PREFETCHW prefetcht0 | |||
| #endif | |||
| #if defined(OPTERON) || defined(BARCELONA) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define movsd movlps | |||
| @@ -699,7 +699,7 @@ | |||
| movsd -32 * SIZE(X), %xmm4 | |||
| pshufd $0xb1, %xmm4, %xmm12 | |||
| shufps $0x39, %xmm8, %xmm8 | |||
| shufps $0x59, %xmm8, %xmm8 | |||
| mulps %xmm8, %xmm4 | |||
| addps %xmm4, %xmm0 | |||
| mulps %xmm8, %xmm12 | |||
| @@ -1336,7 +1336,7 @@ | |||
| movss %xmm9, %xmm8 | |||
| pshufd $0xb1, %xmm4, %xmm12 | |||
| shufps $0x93, %xmm8, %xmm8 | |||
| shufps $0x03, %xmm8, %xmm8 | |||
| mulps %xmm8, %xmm4 | |||
| addps %xmm4, %xmm0 | |||
| mulps %xmm8, %xmm12 | |||
| @@ -1697,7 +1697,7 @@ | |||
| movsd -32 * SIZE(Y), %xmm4 | |||
| pshufd $0xb1, %xmm4, %xmm12 | |||
| shufps $0x39, %xmm8, %xmm8 | |||
| shufps $0xa9, %xmm8, %xmm8 | |||
| mulps %xmm8, %xmm4 | |||
| addps %xmm4, %xmm0 | |||
| mulps %xmm8, %xmm12 | |||
| @@ -2024,7 +2024,7 @@ | |||
| movss %xmm9, %xmm8 | |||
| pshufd $0xb1, %xmm4, %xmm12 | |||
| shufps $0x93, %xmm8, %xmm8 | |||
| shufps $0x03, %xmm8, %xmm8 | |||
| mulps %xmm8, %xmm4 | |||
| addps %xmm4, %xmm0 | |||
| mulps %xmm8, %xmm12 | |||
| @@ -85,7 +85,7 @@ | |||
| #define movsd movlpd | |||
| #endif | |||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| #define RPREFETCHSIZE 32 | |||
| #define WPREFETCHSIZE 48 | |||
| #endif | |||
| @@ -160,7 +160,7 @@ | |||
| #define a3 %xmm14 | |||
| #define xt1 %xmm15 | |||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define MOVDDUP(a, b, c) movddup a(b), c | |||
| #define MOVDDUP2(a, b, c) movddup a##b, c | |||
| #else | |||
| @@ -76,7 +76,7 @@ | |||
| #define movsd movlpd | |||
| #endif | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 16) | |||
| @@ -167,7 +167,7 @@ | |||
| #define a3 %xmm14 | |||
| #define xt1 %xmm15 | |||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| #define MOVDDUP(a, b, c) movddup a(b), c | |||
| #define MOVDDUP2(a, b, c) movddup a##b, c | |||
| #else | |||
| @@ -76,7 +76,7 @@ | |||
| #define movsd movlpd | |||
| #endif | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 16) | |||
| @@ -166,7 +166,7 @@ | |||
| #define xt1 %xmm14 | |||
| #define xt2 %xmm15 | |||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| #define MOVDDUP(a, b, c) movddup a(b), c | |||
| #define MOVDDUP2(a, b, c) movddup a##b, c | |||
| #else | |||
| @@ -76,7 +76,7 @@ | |||
| #define movsd movlpd | |||
| #endif | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (16 * 16) | |||
| @@ -166,7 +166,7 @@ | |||
| #define a3 %xmm14 | |||
| #define xt1 %xmm15 | |||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) | |||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) | |||
| #define MOVDDUP(a, b, c) movddup a(b), c | |||
| #define MOVDDUP2(a, b, c) movddup a##b, c | |||
| #else | |||
| @@ -86,7 +86,7 @@ | |||
| #define BORIG 72(%rsp) | |||
| #define BUFFER 128(%rsp) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHNTA prefetchnta | |||
| @@ -95,7 +95,7 @@ | |||
| #define PREFETCHSIZE (8 * 6 + 4) | |||
| #endif | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHNTA prefetchnta | |||
| @@ -86,7 +86,7 @@ | |||
| #define BORIG 72(%rsp) | |||
| #define BUFFER 128(%rsp) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHNTA prefetchnta | |||
| @@ -95,7 +95,7 @@ | |||
| #define PREFETCHSIZE (8 * 6 + 4) | |||
| #endif | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHNTA prefetchnta | |||
| @@ -86,7 +86,7 @@ | |||
| #define BORIG 72(%rsp) | |||
| #define BUFFER 128(%rsp) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHNTA prefetchnta | |||
| @@ -95,7 +95,7 @@ | |||
| #define PREFETCHSIZE (8 * 6 + 4) | |||
| #endif | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHNTA prefetchnta | |||
| @@ -74,6 +74,13 @@ | |||
| #define ALIGNED_ACCESS | |||
| #endif | |||
| #ifdef BULLDOZER | |||
| #define PREFETCH prefetch | |||
| #define PREFETCHW prefetchw | |||
| #define PREFETCHSIZE (128 * 5) | |||
| #define ALIGNED_ACCESS | |||
| #endif | |||
| #ifdef NANO | |||
| #define PREFETCH prefetcht0 | |||
| #define PREFETCHW prefetcht0 | |||
| @@ -85,7 +85,7 @@ | |||
| #define movsd movlps | |||
| #endif | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define ALIGNED_ACCESS | |||
| #define MOVUPS_A movaps | |||
| #define MOVUPS_XL movaps | |||
| @@ -66,7 +66,9 @@ static FLOAT dm1 = -1.; | |||
| #endif | |||
| #define GEMM_PQ MAX(GEMM_P, GEMM_Q) | |||
| #define REAL_GEMM_R (GEMM_R - GEMM_PQ) | |||
| //leave some space for GEMM_ALIGN in sb2 | |||
| #define REAL_GEMM_R (GEMM_R - 2*GEMM_PQ) | |||
| #if 0 | |||
| #define SHARED_ARRAY | |||
| @@ -220,7 +222,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| sa, | |||
| sb2, | |||
| a + (is + js * lda) * COMPSIZE, lda, | |||
| - is + js); | |||
| is - js); | |||
| #endif | |||
| } | |||
| @@ -4,7 +4,7 @@ DRVOPTS = $(OPTS) | |||
| LOADER = $(FORTRAN) | |||
| TIMER = NONE | |||
| ARCHFLAGS= -ru | |||
| RANLIB = ranlib | |||
| #RANLIB = ranlib | |||
| BLASLIB = | |||
| TMGLIB = tmglib.a | |||
| EIGSRCLIB = eigsrc.a | |||
| @@ -48,7 +48,8 @@ typedef int blasint; | |||
| /* C99 supports complex floating numbers natively, which GCC also offers as an | |||
| extension since version 3.0. If neither are available, use a compatible | |||
| structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ | |||
| #if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 | |||
| #if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ | |||
| (__GNUC__ >= 3 && !defined(__cplusplus))) | |||
| #define OPENBLAS_COMPLEX_C99 | |||
| #include <complex.h> | |||
| typedef float _Complex openblas_complex_float; | |||
| @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) | |||
| #define SNUMOPT 8 | |||
| #define DNUMOPT 4 | |||